Compare commits
201 Commits
407329657f
...
v0.1.50
| Author | SHA1 | Date | |
|---|---|---|---|
| 3f6dffa66e | |||
| 1b64fbde3d | |||
| 4f4c3b0d6e | |||
| bd20f0cae1 | |||
| 11c9a5f9d2 | |||
| aeae60146d | |||
| a82c81e8e3 | |||
| c56e9d7be2 | |||
| c8f800a1e5 | |||
| fc6b3424cf | |||
| 35e06c6734 | |||
| 783d233319 | |||
| 6509a2b91a | |||
| 52f8c40b86 | |||
| a86b5ba8f9 | |||
| 1b964545be | |||
| 97aa1708c2 | |||
| d12689f3b5 | |||
| f22e3ee95e | |||
| e890c5e810 | |||
| 078c30a592 | |||
| a847674004 | |||
| 2618f6b62f | |||
| c3fc5a181d | |||
| 3f45a172b3 | |||
| 5b12c12228 | |||
| 651b801de3 | |||
| 71b9f93d7c | |||
| ae70946c61 | |||
| 2910b7d875 | |||
| 43242debce | |||
| a2519b2814 | |||
| 91f037aa3e | |||
| 627c533724 | |||
| b1bff4857b | |||
| f8a061d496 | |||
| e61a845965 | |||
| ac5d2d4db5 | |||
| 69892a2d84 | |||
| a928d73134 | |||
| af52d49194 | |||
| bc94f75328 | |||
| b6da71b7e7 | |||
| aaf7edfbce | |||
| bb72c42726 | |||
| af5f96ce2f | |||
| 8dffe18a23 | |||
| 0c544753f9 | |||
| c8e26b9bac | |||
| 60ef712fac | |||
| 1ed4666dfd | |||
| 59d260680e | |||
| 9160fac80b | |||
| 83cb43bcf1 | |||
| b310206f1f | |||
| f9bf3ce610 | |||
| 5f8c933844 | |||
| e61fd7fd76 | |||
| 64ceed6236 | |||
| 09dcd53da5 | |||
| 43196af70c | |||
| 1b3f8671c0 | |||
| 16ea853f5b | |||
| d463272cf2 | |||
| 17b5921d8d | |||
| 3d187c9220 | |||
| 4b54a59e35 | |||
| 8dd943e8f1 | |||
| fb6ee6d7ae | |||
| a7e237e2ff | |||
| c48a105c28 | |||
| 71671a8901 | |||
| f5d2ebeaec | |||
| 2d3844b5dd | |||
| 996a199050 | |||
| a991fbb942 | |||
| 7b7e323fd8 | |||
| 114ad52ae8 | |||
| 8978356c49 | |||
| b3c67f4b7f | |||
| 864cafd61f | |||
| 6a1324ba6c | |||
| ab28382d58 | |||
| 9df6106bf5 | |||
| 967244064f | |||
| 99da289183 | |||
| b0b1ea04a1 | |||
| b8afd15417 | |||
| 61287380d3 | |||
| 999e7b5db5 | |||
| c851590aaa | |||
| 6b18cdf562 | |||
| 1b46aa2f13 | |||
| 8cb5650fbb | |||
| 51375e8020 | |||
| 65479c14af | |||
| ecee256f91 | |||
| b391448d33 | |||
| 997b30a9c0 | |||
| d193b90ba1 | |||
| ad298ac70c | |||
| 9f34c67bfa | |||
| 5134c5320a | |||
| 7f5949b818 | |||
| 473f89fb57 | |||
| d0ce1726e8 | |||
| c5ec529210 | |||
| 4193a97737 | |||
| ef9c5b6cf1 | |||
| 84e21dc79a | |||
| 1e5f8d6111 | |||
| 3b1bda741b | |||
| 64af24dc40 | |||
| df036e90dc | |||
| 9e80d6b654 | |||
| 39fc9cd22f | |||
| c99e0bd8ee | |||
| 0f12438ab4 | |||
| 7607e971b8 | |||
| da6f3c3855 | |||
| 174b27f31a | |||
| dc11538ae9 | |||
| 9133e18090 | |||
| 616fad2c5d | |||
| 7bb5c1cf84 | |||
| 245e546f18 | |||
| 14aae90954 | |||
| 52d630a2e5 | |||
| b1f294cf2f | |||
| 1591565b1b | |||
| 08d3454683 | |||
| a6c2983f65 | |||
| 3d2b37b26c | |||
| a6d2a2f086 | |||
| 1315ba1315 | |||
| 0417e2c1f1 | |||
| a08670071c | |||
| 338c4457a5 | |||
| f4b5bb814d | |||
| 7ead8ee98a | |||
| 34822bd835 | |||
| 98afb19945 | |||
| d80f2ce811 | |||
| 89afd9143f | |||
| 98e3ecb0ea | |||
| 41208aa2a0 | |||
| a937032eb1 | |||
| 1e8da8c187 | |||
| 1cc31ec26a | |||
| b580cfde8c | |||
| 5886426dac | |||
| eb268922bd | |||
| 049ac53629 | |||
| 00a8ed3da2 | |||
| e998679901 | |||
| 2ccfc4256a | |||
| 11be496a26 | |||
| 66a79574e0 | |||
| ecaf3aedb5 | |||
| 959745b51b | |||
| d349e2742d | |||
| d4531ef2e8 | |||
| 8023da2c1e | |||
| 28896d0b1b | |||
| 47a7d5ae62 | |||
| fe18ace767 | |||
| a1c980ad31 | |||
| a3c9ac3617 | |||
| dfe9c11102 | |||
| e7200fb1b0 | |||
| f67779be9d | |||
| ca160c9627 | |||
| bf2f066029 | |||
| 07633e4e0e | |||
| 0141a6e111 | |||
| 7f85a6436e | |||
| f0eec38655 | |||
| 8cf8d37556 | |||
| 792ad066c9 | |||
| 4b7d08153c | |||
| 46cc813a68 | |||
| 5d52c5b1aa | |||
| dcca5bbea3 | |||
| 125111ee99 | |||
| 8a36472a3d | |||
| 7a664ef0fb | |||
| cfc89e7312 | |||
| 246973ebf6 | |||
| 3a959e55ed | |||
| 925988896a | |||
| 6bc2ffd94b | |||
| 10aa72816d | |||
| ce2aeeff34 | |||
| 6bc7f97375 | |||
| 244cade7d8 | |||
| 996b89aa47 | |||
| b0112dd8ab | |||
| 1b572c5c1d | |||
| 1b442be9ad | |||
| efdd713f62 | |||
| 672c8bebc9 |
128
.gitea/workflows/release.yml
Normal file
128
.gitea/workflows/release.yml
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
name: Build and Release
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- 'v*'
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
version:
|
||||||
|
description: 'Version to release (e.g., v0.1.0)'
|
||||||
|
required: true
|
||||||
|
default: 'v0.1.0'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build-and-release:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Rust
|
||||||
|
uses: actions-rs/toolchain@v1
|
||||||
|
with:
|
||||||
|
toolchain: stable
|
||||||
|
profile: minimal
|
||||||
|
override: true
|
||||||
|
|
||||||
|
- name: Install system dependencies
|
||||||
|
run: |
|
||||||
|
apt-get update
|
||||||
|
apt-get install -y pkg-config libssl-dev libzmq3-dev
|
||||||
|
|
||||||
|
- name: Build workspace (static)
|
||||||
|
run: |
|
||||||
|
export RUSTFLAGS="-C target-feature=+crt-static"
|
||||||
|
cargo build --release --workspace --target x86_64-unknown-linux-gnu
|
||||||
|
|
||||||
|
- name: Create release directory
|
||||||
|
run: |
|
||||||
|
mkdir -p release
|
||||||
|
cp target/x86_64-unknown-linux-gnu/release/cm-dashboard release/cm-dashboard-linux-x86_64
|
||||||
|
cp target/x86_64-unknown-linux-gnu/release/cm-dashboard-agent release/cm-dashboard-agent-linux-x86_64
|
||||||
|
|
||||||
|
- name: Create tarball
|
||||||
|
run: |
|
||||||
|
cd release
|
||||||
|
tar -czf cm-dashboard-linux-x86_64.tar.gz cm-dashboard-linux-x86_64 cm-dashboard-agent-linux-x86_64
|
||||||
|
|
||||||
|
- name: Set version variable
|
||||||
|
id: version
|
||||||
|
run: |
|
||||||
|
if [ "${{ gitea.event_name }}" == "workflow_dispatch" ]; then
|
||||||
|
echo "VERSION=${{ gitea.event.inputs.version }}" >> $GITHUB_OUTPUT
|
||||||
|
else
|
||||||
|
echo "VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Create Release with curl
|
||||||
|
env:
|
||||||
|
GITEA_TOKEN: ${{ secrets.GITEATOKEN }}
|
||||||
|
run: |
|
||||||
|
VERSION="${{ steps.version.outputs.VERSION }}"
|
||||||
|
|
||||||
|
# Create release
|
||||||
|
curl -X POST \
|
||||||
|
-H "Authorization: token $GITEA_TOKEN" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"tag_name": "'$VERSION'",
|
||||||
|
"name": "cm-dashboard '$VERSION'",
|
||||||
|
"body": "## cm-dashboard '$VERSION'\n\nPre-built binaries for Linux x86_64:\n- cm-dashboard-linux-x86_64 - Dashboard TUI binary\n- cm-dashboard-agent-linux-x86_64 - Agent daemon binary\n- cm-dashboard-linux-x86_64.tar.gz - Combined tarball"
|
||||||
|
}' \
|
||||||
|
"https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/releases"
|
||||||
|
|
||||||
|
# Get release ID
|
||||||
|
RELEASE_ID=$(curl -s -H "Authorization: token $GITEA_TOKEN" \
|
||||||
|
"https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/releases/tags/$VERSION" | \
|
||||||
|
grep -o '"id":[0-9]*' | head -1 | cut -d':' -f2)
|
||||||
|
|
||||||
|
# Upload binaries
|
||||||
|
curl -X POST \
|
||||||
|
-H "Authorization: token $GITEA_TOKEN" \
|
||||||
|
-F "attachment=@release/cm-dashboard-linux-x86_64" \
|
||||||
|
"https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/releases/$RELEASE_ID/assets?name=cm-dashboard-linux-x86_64"
|
||||||
|
|
||||||
|
curl -X POST \
|
||||||
|
-H "Authorization: token $GITEA_TOKEN" \
|
||||||
|
-F "attachment=@release/cm-dashboard-agent-linux-x86_64" \
|
||||||
|
"https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/releases/$RELEASE_ID/assets?name=cm-dashboard-agent-linux-x86_64"
|
||||||
|
|
||||||
|
curl -X POST \
|
||||||
|
-H "Authorization: token $GITEA_TOKEN" \
|
||||||
|
-F "attachment=@release/cm-dashboard-linux-x86_64.tar.gz" \
|
||||||
|
"https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/releases/$RELEASE_ID/assets?name=cm-dashboard-linux-x86_64.tar.gz"
|
||||||
|
|
||||||
|
- name: Update NixOS Configuration
|
||||||
|
env:
|
||||||
|
GITEA_TOKEN: ${{ secrets.GITEATOKEN }}
|
||||||
|
run: |
|
||||||
|
VERSION="${{ steps.version.outputs.VERSION }}"
|
||||||
|
|
||||||
|
# Clone nixosbox repository
|
||||||
|
git clone https://$GITEA_TOKEN@gitea.cmtec.se/cm/nixosbox.git nixosbox-update
|
||||||
|
cd nixosbox-update
|
||||||
|
|
||||||
|
# Get hash for the new release tarball
|
||||||
|
TARBALL_URL="https://gitea.cmtec.se/cm/cm-dashboard/releases/download/$VERSION/cm-dashboard-linux-x86_64.tar.gz"
|
||||||
|
|
||||||
|
# Download tarball to get correct hash
|
||||||
|
curl -L -o cm-dashboard.tar.gz "$TARBALL_URL"
|
||||||
|
# Convert sha256 hex to base64 for Nix hash format using Python
|
||||||
|
NEW_HASH=$(sha256sum cm-dashboard.tar.gz | cut -d' ' -f1)
|
||||||
|
NIX_HASH="sha256-$(python3 -c "import base64, binascii; print(base64.b64encode(binascii.unhexlify('$NEW_HASH')).decode())")"
|
||||||
|
|
||||||
|
# Update the NixOS configuration
|
||||||
|
sed -i "s|version = \"v[^\"]*\"|version = \"$VERSION\"|" hosts/common/cm-dashboard.nix
|
||||||
|
sed -i "s|sha256 = \"sha256-[^\"]*\"|sha256 = \"$NIX_HASH\"|" hosts/common/cm-dashboard.nix
|
||||||
|
|
||||||
|
# Commit and push changes
|
||||||
|
git config user.name "Gitea Actions"
|
||||||
|
git config user.email "actions@gitea.cmtec.se"
|
||||||
|
git add hosts/common/cm-dashboard.nix
|
||||||
|
git commit -m "Auto-update cm-dashboard to $VERSION
|
||||||
|
|
||||||
|
- Update version to $VERSION with automated release
|
||||||
|
- Update tarball hash for new static binaries
|
||||||
|
- Automated update from cm-dashboard release workflow"
|
||||||
|
git push
|
||||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,2 +1,3 @@
|
|||||||
/target
|
/target
|
||||||
logs/
|
logs/
|
||||||
|
backup/legacy-2025-10-16
|
||||||
|
|||||||
@@ -1,3 +0,0 @@
|
|||||||
# Agent Guide
|
|
||||||
|
|
||||||
Agents working in this repo must follow the instructions in `CLAUDE.md`.
|
|
||||||
483
CLAUDE.md
483
CLAUDE.md
@@ -2,347 +2,147 @@
|
|||||||
|
|
||||||
## Overview
|
## Overview
|
||||||
|
|
||||||
A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. Built to replace Glance with a custom solution tailored for our specific monitoring needs and API integrations.
|
A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. Built with ZMQ-based metric collection and individual metrics architecture.
|
||||||
|
|
||||||
## Project Goals
|
## Current Features
|
||||||
|
|
||||||
### Core Objectives
|
### Core Functionality
|
||||||
|
- **Real-time Monitoring**: CPU, RAM, Storage, and Service status
|
||||||
|
- **Service Management**: Start/stop services with user-stopped tracking
|
||||||
|
- **Multi-host Support**: Monitor multiple servers from single dashboard
|
||||||
|
- **NixOS Integration**: System rebuild via SSH + tmux popup
|
||||||
|
- **Backup Monitoring**: Borgbackup status and scheduling
|
||||||
|
|
||||||
- **Real-time monitoring** of all infrastructure components
|
### User-Stopped Service Tracking
|
||||||
- **Multi-host support** for cmbox, labbox, simonbox, steambox, srv01
|
- Services stopped via dashboard are marked as "user-stopped"
|
||||||
- **Performance-focused** with minimal resource usage
|
- User-stopped services report Status::OK instead of Warning
|
||||||
- **Keyboard-driven interface** for power users
|
- Prevents false alerts during intentional maintenance
|
||||||
- **Integration** with existing monitoring APIs (ports 6127, 6128, 6129)
|
- Persistent storage survives agent restarts
|
||||||
|
- Automatic flag clearing when services are restarted via dashboard
|
||||||
### Key Features
|
|
||||||
|
|
||||||
- **NVMe health monitoring** with wear prediction
|
|
||||||
- **CPU / memory / GPU telemetry** with automatic thresholding
|
|
||||||
- **Service resource monitoring** with per-service CPU and RAM usage
|
|
||||||
- **Disk usage overview** for root filesystems
|
|
||||||
- **Backup status** with detailed metrics and history
|
|
||||||
- **Unified alert pipeline** summarising host health
|
|
||||||
- **Historical data tracking** and trend analysis
|
|
||||||
|
|
||||||
## Technical Architecture
|
|
||||||
|
|
||||||
### Technology Stack
|
|
||||||
|
|
||||||
- **Language**: Rust 🦀
|
|
||||||
- **TUI Framework**: ratatui (modern tui-rs fork)
|
|
||||||
- **Async Runtime**: tokio
|
|
||||||
- **HTTP Client**: reqwest
|
|
||||||
- **Serialization**: serde
|
|
||||||
- **CLI**: clap
|
|
||||||
- **Error Handling**: anyhow
|
|
||||||
- **Time**: chrono
|
|
||||||
|
|
||||||
### Dependencies
|
|
||||||
|
|
||||||
|
### Custom Service Logs
|
||||||
|
- Configure service-specific log file paths per host in dashboard config
|
||||||
|
- Press `L` on any service to view custom log files via `tail -f`
|
||||||
|
- Configuration format in dashboard config:
|
||||||
```toml
|
```toml
|
||||||
[dependencies]
|
[service_logs]
|
||||||
ratatui = "0.24" # Modern TUI framework
|
hostname1 = [
|
||||||
crossterm = "0.27" # Cross-platform terminal handling
|
{ service_name = "nginx", log_file_path = "/var/log/nginx/access.log" },
|
||||||
tokio = { version = "1.0", features = ["full"] } # Async runtime
|
{ service_name = "app", log_file_path = "/var/log/myapp/app.log" }
|
||||||
reqwest = { version = "0.11", features = ["json"] } # HTTP client
|
]
|
||||||
serde = { version = "1.0", features = ["derive"] } # JSON parsing
|
hostname2 = [
|
||||||
clap = { version = "4.0", features = ["derive"] } # CLI args
|
{ service_name = "database", log_file_path = "/var/log/postgres/postgres.log" }
|
||||||
anyhow = "1.0" # Error handling
|
]
|
||||||
chrono = "0.4" # Time handling
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Project Structure
|
### Service Management
|
||||||
|
- **Direct Control**: Arrow keys (↑↓) or vim keys (j/k) navigate services
|
||||||
|
- **Service Actions**:
|
||||||
|
- `s` - Start service (sends UserStart command)
|
||||||
|
- `S` - Stop service (sends UserStop command)
|
||||||
|
- `J` - Show service logs (journalctl in tmux popup)
|
||||||
|
- `L` - Show custom log files (tail -f custom paths in tmux popup)
|
||||||
|
- `R` - Rebuild current host
|
||||||
|
- **Visual Status**: Green ● (active), Yellow ◐ (inactive), Red ◯ (failed)
|
||||||
|
- **Transitional Icons**: Blue arrows during operations
|
||||||
|
|
||||||
```
|
### Navigation
|
||||||
cm-dashboard/
|
- **Tab**: Switch between hosts
|
||||||
├── Cargo.toml
|
- **↑↓ or j/k**: Select services
|
||||||
├── README.md
|
- **J**: Show service logs (journalctl)
|
||||||
├── CLAUDE.md # This file
|
- **L**: Show custom log files
|
||||||
├── src/
|
- **q**: Quit dashboard
|
||||||
│ ├── main.rs # Entry point & CLI
|
|
||||||
│ ├── app.rs # Main application state
|
|
||||||
│ ├── ui/
|
|
||||||
│ │ ├── mod.rs
|
|
||||||
│ │ ├── dashboard.rs # Main dashboard layout
|
|
||||||
│ │ ├── nvme.rs # NVMe health widget
|
|
||||||
│ │ ├── services.rs # Services status widget
|
|
||||||
│ │ ├── memory.rs # RAM optimization widget
|
|
||||||
│ │ ├── backup.rs # Backup status widget
|
|
||||||
│ │ └── alerts.rs # Alerts/notifications widget
|
|
||||||
│ ├── api/
|
|
||||||
│ │ ├── mod.rs
|
|
||||||
│ │ ├── client.rs # HTTP client wrapper
|
|
||||||
│ │ ├── smart.rs # Smart metrics API (port 6127)
|
|
||||||
│ │ ├── service.rs # Service metrics API (port 6128)
|
|
||||||
│ │ └── backup.rs # Backup metrics API (port 6129)
|
|
||||||
│ ├── data/
|
|
||||||
│ │ ├── mod.rs
|
|
||||||
│ │ ├── metrics.rs # Data structures
|
|
||||||
│ │ ├── history.rs # Historical data storage
|
|
||||||
│ │ └── config.rs # Host configuration
|
|
||||||
│ └── config.rs # Application configuration
|
|
||||||
├── config/
|
|
||||||
│ ├── hosts.toml # Host definitions
|
|
||||||
│ └── dashboard.toml # Dashboard layout config
|
|
||||||
└── docs/
|
|
||||||
├── API.md # API integration documentation
|
|
||||||
└── WIDGETS.md # Widget development guide
|
|
||||||
```
|
|
||||||
|
|
||||||
### Data Structures
|
## Core Architecture Principles
|
||||||
|
|
||||||
```rust
|
### Individual Metrics Philosophy
|
||||||
#[derive(Deserialize, Debug)]
|
- Agent collects individual metrics, dashboard composes widgets
|
||||||
pub struct SmartMetrics {
|
- Each metric collected, transmitted, and stored individually
|
||||||
pub status: String,
|
- Agent calculates status for each metric using thresholds
|
||||||
pub drives: Vec<DriveInfo>,
|
- Dashboard aggregates individual metric statuses for widget status
|
||||||
pub summary: DriveSummary,
|
|
||||||
pub issues: Vec<String>,
|
|
||||||
pub timestamp: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize, Debug)]
|
|
||||||
pub struct ServiceMetrics {
|
|
||||||
pub summary: ServiceSummary,
|
|
||||||
pub services: Vec<ServiceInfo>,
|
|
||||||
pub timestamp: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize, Debug)]
|
|
||||||
pub struct ServiceSummary {
|
|
||||||
pub healthy: usize,
|
|
||||||
pub degraded: usize,
|
|
||||||
pub failed: usize,
|
|
||||||
pub memory_used_mb: f32,
|
|
||||||
pub memory_quota_mb: f32,
|
|
||||||
pub system_memory_used_mb: f32,
|
|
||||||
pub system_memory_total_mb: f32,
|
|
||||||
pub disk_used_gb: f32,
|
|
||||||
pub disk_total_gb: f32,
|
|
||||||
pub cpu_load_1: f32,
|
|
||||||
pub cpu_load_5: f32,
|
|
||||||
pub cpu_load_15: f32,
|
|
||||||
pub cpu_freq_mhz: Option<f32>,
|
|
||||||
pub cpu_temp_c: Option<f32>,
|
|
||||||
pub gpu_load_percent: Option<f32>,
|
|
||||||
pub gpu_temp_c: Option<f32>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize, Debug)]
|
|
||||||
pub struct BackupMetrics {
|
|
||||||
pub overall_status: String,
|
|
||||||
pub backup: BackupInfo,
|
|
||||||
pub service: BackupServiceInfo,
|
|
||||||
pub timestamp: u64,
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Dashboard Layout Design
|
|
||||||
|
|
||||||
### Main Dashboard View
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────────────────────────────────────┐
|
|
||||||
│ CM Dashboard • cmbox │
|
|
||||||
├─────────────────────────────────────────────────────────────────────┤
|
|
||||||
│ Storage • ok:1 warn:0 crit:0 │ Services • ok:1 warn:0 fail:0 │
|
|
||||||
│ ┌─────────────────────────────────┐ │ ┌─────────────────────────────── │ │
|
|
||||||
│ │Drive Temp Wear Spare Hours │ │ │Service memory: 7.1/23899.7 MiB│ │
|
|
||||||
│ │nvme0n1 28°C 1% 100% 14489 │ │ │Disk usage: — │ │
|
|
||||||
│ │ Capacity Usage │ │ │ Service Memory Disk │ │
|
|
||||||
│ │ 954G 77G (8%) │ │ │✔ sshd 7.1 MiB — │ │
|
|
||||||
│ └─────────────────────────────────┘ │ └─────────────────────────────── │ │
|
|
||||||
├─────────────────────────────────────────────────────────────────────┤
|
|
||||||
│ CPU / Memory • warn │ Backups │
|
|
||||||
│ System memory: 5251.7/23899.7 MiB │ Host cmbox awaiting backup │ │
|
|
||||||
│ CPU load (1/5/15): 2.18 2.66 2.56 │ metrics │ │
|
|
||||||
│ CPU freq: 1100.1 MHz │ │ │
|
|
||||||
│ CPU temp: 47.0°C │ │ │
|
|
||||||
├─────────────────────────────────────────────────────────────────────┤
|
|
||||||
│ Alerts • ok:0 warn:3 fail:0 │ Status • ZMQ connected │
|
|
||||||
│ cmbox: warning: CPU load 2.18 │ Monitoring • hosts: 3 │ │
|
|
||||||
│ srv01: pending: awaiting metrics │ Data source: ZMQ – connected │ │
|
|
||||||
│ labbox: pending: awaiting metrics │ Active host: cmbox (1/3) │ │
|
|
||||||
└─────────────────────────────────────────────────────────────────────┘
|
|
||||||
Keys: [←→] hosts [r]efresh [q]uit
|
|
||||||
```
|
|
||||||
|
|
||||||
### Multi-Host View
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────────────────────────────────────┐
|
|
||||||
│ 🖥️ CMTEC Host Overview │
|
|
||||||
├─────────────────────────────────────────────────────────────────────┤
|
|
||||||
│ Host │ NVMe Wear │ RAM Usage │ Services │ Last Alert │
|
|
||||||
├─────────────────────────────────────────────────────────────────────┤
|
|
||||||
│ srv01 │ 4% ✅ │ 32% ✅ │ 8/8 ✅ │ 04:00 Backup OK │
|
|
||||||
│ cmbox │ 12% ✅ │ 45% ✅ │ 3/3 ✅ │ Yesterday Email test │
|
|
||||||
│ labbox │ 8% ✅ │ 28% ✅ │ 2/2 ✅ │ 2h ago NVMe temp OK │
|
|
||||||
│ simonbox │ 15% ✅ │ 67% ⚠️ │ 4/4 ✅ │ Gaming session active │
|
|
||||||
│ steambox │ 23% ✅ │ 78% ⚠️ │ 2/2 ✅ │ High RAM usage │
|
|
||||||
└─────────────────────────────────────────────────────────────────────┘
|
|
||||||
Keys: [Enter] details [r]efresh [s]ort [f]ilter [q]uit
|
|
||||||
```
|
|
||||||
|
|
||||||
## Architecture Principles - CRITICAL
|
|
||||||
|
|
||||||
### Agent-Dashboard Separation of Concerns
|
|
||||||
|
|
||||||
**AGENT IS SINGLE SOURCE OF TRUTH FOR ALL STATUS CALCULATIONS**
|
|
||||||
- Agent calculates status ("ok"/"warning"/"critical"/"unknown") using defined thresholds
|
|
||||||
- Agent sends status to dashboard via ZMQ
|
|
||||||
- Dashboard NEVER calculates status - only displays what agent provides
|
|
||||||
|
|
||||||
**Data Flow Architecture:**
|
|
||||||
```
|
|
||||||
Agent (calculations + thresholds) → Status → Dashboard (display only) → TableBuilder (colors)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Status Handling Rules:**
|
|
||||||
- Agent provides status → Dashboard uses agent status
|
|
||||||
- Agent doesn't provide status → Dashboard shows "unknown" (NOT "ok")
|
|
||||||
- Dashboard widgets NEVER contain hardcoded thresholds
|
|
||||||
- TableBuilder converts status to colors for display
|
|
||||||
|
|
||||||
### Current Agent Thresholds (as of 2025-10-12)
|
|
||||||
|
|
||||||
**CPU Load (service.rs:392-400):**
|
|
||||||
- Warning: ≥ 2.0 (testing value, was 5.0)
|
|
||||||
- Critical: ≥ 4.0 (testing value, was 8.0)
|
|
||||||
|
|
||||||
**CPU Temperature (service.rs:412-420):**
|
|
||||||
- Warning: ≥ 70.0°C
|
|
||||||
- Critical: ≥ 80.0°C
|
|
||||||
|
|
||||||
**Memory Usage (service.rs:402-410):**
|
|
||||||
- Warning: ≥ 80%
|
|
||||||
- Critical: ≥ 95%
|
|
||||||
|
|
||||||
### Email Notifications
|
|
||||||
|
|
||||||
**System Configuration:**
|
|
||||||
- From: `{hostname}@cmtec.se` (e.g., cmbox@cmtec.se)
|
|
||||||
- To: `cm@cmtec.se`
|
|
||||||
- SMTP: localhost:25 (postfix)
|
|
||||||
- Timezone: Europe/Stockholm (not UTC)
|
|
||||||
|
|
||||||
**Notification Triggers:**
|
|
||||||
- Status degradation: any → "warning" or "critical"
|
|
||||||
- Recovery: "warning"/"critical" → "ok"
|
|
||||||
- Rate limiting: configurable (set to 0 for testing, 30 minutes for production)
|
|
||||||
|
|
||||||
**Monitored Components:**
|
|
||||||
- system.cpu (load status) - SystemCollector
|
|
||||||
- system.memory (usage status) - SystemCollector
|
|
||||||
- system.cpu_temp (temperature status) - SystemCollector (disabled)
|
|
||||||
- system.services (service health status) - ServiceCollector
|
|
||||||
- storage.smart (drive health) - SmartCollector
|
|
||||||
- backup.overall (backup status) - BackupCollector
|
|
||||||
|
|
||||||
### Pure Auto-Discovery Implementation
|
|
||||||
|
|
||||||
**Agent Configuration:**
|
|
||||||
- No config files required
|
|
||||||
- Auto-detects storage devices, services, backup systems
|
|
||||||
- Runtime discovery of system capabilities
|
|
||||||
- CLI: `cm-dashboard-agent [-v]` (only verbose flag)
|
|
||||||
|
|
||||||
**Service Discovery:**
|
|
||||||
- Scans running systemd services
|
|
||||||
- Filters by predefined interesting patterns (gitea, nginx, docker, etc.)
|
|
||||||
- No host-specific hardcoded service lists
|
|
||||||
|
|
||||||
### Current Implementation Status
|
|
||||||
|
|
||||||
**Completed:**
|
|
||||||
- [x] Pure auto-discovery agent (no config files)
|
|
||||||
- [x] Agent-side status calculations with defined thresholds
|
|
||||||
- [x] Dashboard displays agent status (no dashboard calculations)
|
|
||||||
- [x] Email notifications with Stockholm timezone
|
|
||||||
- [x] CPU temperature monitoring and notifications
|
|
||||||
- [x] ZMQ message format standardization
|
|
||||||
- [x] Removed all hardcoded dashboard thresholds
|
|
||||||
- [x] CPU thresholds restored to production values (5.0/8.0)
|
|
||||||
- [x] All collectors output standardized status strings (ok/warning/critical/unknown)
|
|
||||||
- [x] Dashboard connection loss detection with 5-second keep-alive
|
|
||||||
- [x] Removed excessive logging from agent
|
|
||||||
- [x] Fixed all compiler warnings in both agent and dashboard
|
|
||||||
- [x] **SystemCollector architecture refactoring completed (2025-10-12)**
|
|
||||||
- [x] Created SystemCollector for CPU load, memory, temperature, C-states
|
|
||||||
- [x] Moved system metrics from ServiceCollector to SystemCollector
|
|
||||||
- [x] Updated dashboard to parse and display SystemCollector data
|
|
||||||
- [x] Enhanced service notifications to include specific failure details
|
|
||||||
- [x] CPU temperature thresholds set to 100°C (effectively disabled)
|
|
||||||
- [x] **SystemCollector bug fixes completed (2025-10-12)**
|
|
||||||
- [x] Fixed CPU load parsing for comma decimal separator locale (", " split)
|
|
||||||
- [x] Fixed CPU temperature to prioritize x86_pkg_temp over generic thermal zones
|
|
||||||
- [x] Fixed C-state collection to discover all available states (including C10)
|
|
||||||
- [x] **Dashboard improvements and maintenance mode (2025-10-13)**
|
|
||||||
- [x] Host auto-discovery with predefined CMTEC infrastructure hosts (cmbox, labbox, simonbox, steambox, srv01)
|
|
||||||
- [x] Host navigation limited to connected hosts only (no disconnected host cycling)
|
|
||||||
- [x] Storage widget restructured: Name/Temp/Wear/Usage columns with SMART details as descriptions
|
|
||||||
- [x] Agent-provided descriptions for Storage widget (agent is source of truth for formatting)
|
|
||||||
- [x] Maintenance mode implementation: /tmp/cm-maintenance file suppresses notifications
|
|
||||||
- [x] NixOS borgbackup integration with automatic maintenance mode during backups
|
|
||||||
- [x] System widget simplified to single row with C-states as description lines
|
|
||||||
- [x] CPU load thresholds updated to production values (9.0/10.0)
|
|
||||||
|
|
||||||
**Production Configuration:**
|
|
||||||
- CPU load thresholds: Warning ≥ 9.0, Critical ≥ 10.0
|
|
||||||
- CPU temperature thresholds: Warning ≥ 100°C, Critical ≥ 100°C (effectively disabled)
|
|
||||||
- Memory usage thresholds: Warning ≥ 80%, Critical ≥ 95%
|
|
||||||
- Connection timeout: 15 seconds (agents send data every 5 seconds)
|
|
||||||
- Email rate limiting: 30 minutes (set to 0 for testing)
|
|
||||||
|
|
||||||
### Maintenance Mode
|
### Maintenance Mode
|
||||||
|
|
||||||
**Purpose:**
|
|
||||||
- Suppress email notifications during planned maintenance or backups
|
|
||||||
- Prevents false alerts when services are intentionally stopped
|
|
||||||
|
|
||||||
**Implementation:**
|
|
||||||
- Agent checks for `/tmp/cm-maintenance` file before sending notifications
|
- Agent checks for `/tmp/cm-maintenance` file before sending notifications
|
||||||
- File presence suppresses all email notifications while continuing monitoring
|
- File presence suppresses all email notifications while continuing monitoring
|
||||||
- Dashboard continues to show real status, only notifications are blocked
|
- Dashboard continues to show real status, only notifications are blocked
|
||||||
|
|
||||||
**Usage:**
|
Usage:
|
||||||
```bash
|
```bash
|
||||||
# Enable maintenance mode
|
# Enable maintenance mode
|
||||||
touch /tmp/cm-maintenance
|
touch /tmp/cm-maintenance
|
||||||
|
|
||||||
# Run maintenance tasks (backups, service restarts, etc.)
|
# Run maintenance tasks
|
||||||
systemctl stop service
|
systemctl stop service
|
||||||
# ... maintenance work ...
|
# ... maintenance work ...
|
||||||
systemctl start service
|
systemctl start service
|
||||||
|
|
||||||
# Disable maintenance mode
|
# Disable maintenance mode
|
||||||
rm /tmp/cm-maintenance
|
rm /tmp/cm-maintenance
|
||||||
```
|
```
|
||||||
|
|
||||||
**NixOS Integration:**
|
## Development and Deployment Architecture
|
||||||
- Borgbackup script automatically creates/removes maintenance file
|
|
||||||
- Automatic cleanup via trap ensures maintenance mode doesn't stick
|
|
||||||
|
|
||||||
### Development Guidelines
|
### Development Path
|
||||||
|
- **Location:** `~/projects/cm-dashboard`
|
||||||
|
- **Purpose:** Development workflow only - for committing new code
|
||||||
|
- **Access:** Only for developers to commit changes
|
||||||
|
|
||||||
**When Adding New Metrics:**
|
### Deployment Path
|
||||||
1. Agent calculates status with thresholds
|
- **Location:** `/var/lib/cm-dashboard/nixos-config`
|
||||||
2. Agent adds `{metric}_status` field to JSON output
|
- **Purpose:** Production deployment only - agent clones/pulls from git
|
||||||
3. Dashboard data structure adds `{metric}_status: Option<String>`
|
- **Workflow:** git pull → `/var/lib/cm-dashboard/nixos-config` → nixos-rebuild
|
||||||
4. Dashboard uses `status_level_from_agent_status()` for display
|
|
||||||
5. Agent adds notification monitoring for status changes
|
|
||||||
|
|
||||||
**NEVER:**
|
### Git Flow
|
||||||
- Add hardcoded thresholds to dashboard widgets
|
```
|
||||||
- Calculate status in dashboard with different thresholds than agent
|
Development: ~/projects/cm-dashboard → git commit → git push
|
||||||
- Use "ok" as default when agent status is missing (use "unknown")
|
Deployment: git pull → /var/lib/cm-dashboard/nixos-config → rebuild
|
||||||
- Calculate colors in widgets (TableBuilder's responsibility)
|
```
|
||||||
|
|
||||||
# Important Communication Guidelines
|
## Automated Binary Release System
|
||||||
|
|
||||||
NEVER write that you have "successfully implemented" something or generate extensive summary text without first verifying with the user that the implementation is correct. This wastes tokens. Keep responses concise.
|
CM Dashboard uses automated binary releases instead of source builds.
|
||||||
|
|
||||||
NEVER implement code without first getting explicit user agreement on the approach. Always ask for confirmation before proceeding with implementation.
|
### Creating New Releases
|
||||||
|
```bash
|
||||||
|
cd ~/projects/cm-dashboard
|
||||||
|
git tag v0.1.X
|
||||||
|
git push origin v0.1.X
|
||||||
|
```
|
||||||
|
|
||||||
|
This automatically:
|
||||||
|
- Builds static binaries with `RUSTFLAGS="-C target-feature=+crt-static"`
|
||||||
|
- Creates GitHub-style release with tarball
|
||||||
|
- Uploads binaries via Gitea API
|
||||||
|
|
||||||
|
### NixOS Configuration Updates
|
||||||
|
Edit `~/projects/nixosbox/hosts/common/cm-dashboard.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
version = "v0.1.X";
|
||||||
|
src = pkgs.fetchurl {
|
||||||
|
url = "https://gitea.cmtec.se/cm/cm-dashboard/releases/download/${version}/cm-dashboard-linux-x86_64.tar.gz";
|
||||||
|
sha256 = "sha256-NEW_HASH_HERE";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Get Release Hash
|
||||||
|
```bash
|
||||||
|
cd ~/projects/nixosbox
|
||||||
|
nix-build --no-out-link -E 'with import <nixpkgs> {}; fetchurl {
|
||||||
|
url = "https://gitea.cmtec.se/cm/cm-dashboard/releases/download/v0.1.X/cm-dashboard-linux-x86_64.tar.gz";
|
||||||
|
sha256 = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=";
|
||||||
|
}' 2>&1 | grep "got:"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Building
|
||||||
|
|
||||||
|
**Testing & Building:**
|
||||||
|
- **Workspace builds**: `nix-shell -p openssl pkg-config --run "cargo build --workspace"`
|
||||||
|
- **Clean compilation**: Remove `target/` between major changes
|
||||||
|
|
||||||
|
## Important Communication Guidelines
|
||||||
|
|
||||||
|
Keep responses concise and focused. Avoid extensive implementation summaries unless requested.
|
||||||
|
|
||||||
## Commit Message Guidelines
|
## Commit Message Guidelines
|
||||||
|
|
||||||
@@ -358,62 +158,29 @@ NEVER implement code without first getting explicit user agreement on the approa
|
|||||||
- Write from the perspective of a human developer
|
- Write from the perspective of a human developer
|
||||||
|
|
||||||
**Examples:**
|
**Examples:**
|
||||||
- ❌ "Generated with Claude Code"
|
- ❌ "Generated with Claude Code"
|
||||||
- ❌ "AI-assisted implementation"
|
- ❌ "AI-assisted implementation"
|
||||||
- ❌ "Automated refactoring"
|
- ❌ "Automated refactoring"
|
||||||
- ✅ "Implement maintenance mode for backup operations"
|
- ✅ "Implement maintenance mode for backup operations"
|
||||||
- ✅ "Restructure storage widget with improved layout"
|
- ✅ "Restructure storage widget with improved layout"
|
||||||
- ✅ "Update CPU thresholds to production values"
|
- ✅ "Update CPU thresholds to production values"
|
||||||
|
|
||||||
## NixOS Configuration Updates
|
## Implementation Rules
|
||||||
|
|
||||||
When code changes are made to cm-dashboard, the NixOS configuration at `~/nixosbox` must be updated to deploy the changes.
|
1. **Individual Metrics**: Each metric is collected, transmitted, and stored individually
|
||||||
|
2. **Agent Status Authority**: Agent calculates status for each metric using thresholds
|
||||||
|
3. **Dashboard Composition**: Dashboard widgets subscribe to specific metrics by name
|
||||||
|
4. **Status Aggregation**: Dashboard aggregates individual metric statuses for widget status
|
||||||
|
|
||||||
### Update Process
|
**NEVER:**
|
||||||
|
- Copy/paste ANY code from legacy implementations
|
||||||
|
- Calculate status in dashboard widgets
|
||||||
|
- Hardcode metric names in widgets (use const arrays)
|
||||||
|
- Create files unless absolutely necessary for achieving goals
|
||||||
|
- Create documentation files unless explicitly requested
|
||||||
|
|
||||||
1. **Get Latest Commit Hash**
|
**ALWAYS:**
|
||||||
```bash
|
- Prefer editing existing files to creating new ones
|
||||||
git log -1 --format="%H"
|
- Follow existing code conventions and patterns
|
||||||
```
|
- Use existing libraries and utilities
|
||||||
|
- Follow security best practices
|
||||||
2. **Update NixOS Configuration**
|
|
||||||
Edit `~/nixosbox/hosts/common/cm-dashboard.nix`:
|
|
||||||
```nix
|
|
||||||
src = pkgs.fetchgit {
|
|
||||||
url = "https://gitea.cmtec.se/cm/cm-dashboard.git";
|
|
||||||
rev = "NEW_COMMIT_HASH_HERE";
|
|
||||||
sha256 = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA="; # Placeholder
|
|
||||||
};
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Get Correct Source Hash**
|
|
||||||
Build with placeholder hash to get the actual hash:
|
|
||||||
```bash
|
|
||||||
cd ~/nixosbox
|
|
||||||
nix-build --no-out-link -E 'with import <nixpkgs> {}; fetchgit {
|
|
||||||
url = "https://gitea.cmtec.se/cm/cm-dashboard.git";
|
|
||||||
rev = "NEW_COMMIT_HASH";
|
|
||||||
sha256 = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=";
|
|
||||||
}' 2>&1 | grep "got:"
|
|
||||||
```
|
|
||||||
|
|
||||||
Example output:
|
|
||||||
```
|
|
||||||
error: hash mismatch in fixed-output derivation '/nix/store/...':
|
|
||||||
specified: sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=
|
|
||||||
got: sha256-x8crxNusOUYRrkP9mYEOG+Ga3JCPIdJLkEAc5P1ZxdQ=
|
|
||||||
```
|
|
||||||
|
|
||||||
4. **Update Configuration with Correct Hash**
|
|
||||||
Replace the placeholder with the hash from the error message (the "got:" line).
|
|
||||||
|
|
||||||
5. **Commit NixOS Configuration**
|
|
||||||
```bash
|
|
||||||
cd ~/nixosbox
|
|
||||||
git add hosts/common/cm-dashboard.nix
|
|
||||||
git commit -m "Update cm-dashboard to latest version (SHORT_HASH)"
|
|
||||||
git push
|
|
||||||
```
|
|
||||||
|
|
||||||
6. **Rebuild System**
|
|
||||||
The user handles the system rebuild step - this cannot be automated.
|
|
||||||
220
Cargo.lock
generated
220
Cargo.lock
generated
@@ -132,9 +132,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bitflags"
|
name = "bitflags"
|
||||||
version = "2.9.4"
|
version = "2.10.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394"
|
checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bumpalo"
|
name = "bumpalo"
|
||||||
@@ -178,9 +178,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cfg-if"
|
name = "cfg-if"
|
||||||
version = "1.0.3"
|
version = "1.0.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9"
|
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "chrono"
|
name = "chrono"
|
||||||
@@ -270,7 +270,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cm-dashboard"
|
name = "cm-dashboard"
|
||||||
version = "0.1.0"
|
version = "0.1.49"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"chrono",
|
"chrono",
|
||||||
@@ -281,17 +281,17 @@ dependencies = [
|
|||||||
"ratatui",
|
"ratatui",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"thiserror",
|
||||||
"tokio",
|
"tokio",
|
||||||
"toml",
|
"toml",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-appender",
|
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
"zmq",
|
"zmq",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cm-dashboard-agent"
|
name = "cm-dashboard-agent"
|
||||||
version = "0.1.0"
|
version = "0.1.49"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
@@ -299,28 +299,27 @@ dependencies = [
|
|||||||
"chrono-tz",
|
"chrono-tz",
|
||||||
"clap",
|
"clap",
|
||||||
"cm-dashboard-shared",
|
"cm-dashboard-shared",
|
||||||
"futures",
|
|
||||||
"gethostname",
|
"gethostname",
|
||||||
"lettre",
|
"lettre",
|
||||||
"rand",
|
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
"toml",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-appender",
|
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
"zmq",
|
"zmq",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cm-dashboard-shared"
|
name = "cm-dashboard-shared"
|
||||||
version = "0.1.0"
|
version = "0.1.49"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"chrono",
|
"chrono",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"thiserror",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -407,7 +406,7 @@ version = "0.27.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
|
checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags 2.9.4",
|
"bitflags 2.10.0",
|
||||||
"crossterm_winapi",
|
"crossterm_winapi",
|
||||||
"libc",
|
"libc",
|
||||||
"mio 0.8.11",
|
"mio 0.8.11",
|
||||||
@@ -426,15 +425,6 @@ dependencies = [
|
|||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "deranged"
|
|
||||||
version = "0.5.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "a41953f86f8a05768a6cda24def994fd2f424b04ec5c719cf89989779f199071"
|
|
||||||
dependencies = [
|
|
||||||
"powerfmt",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "dircpy"
|
name = "dircpy"
|
||||||
version = "0.3.19"
|
version = "0.3.19"
|
||||||
@@ -552,21 +542,6 @@ dependencies = [
|
|||||||
"percent-encoding",
|
"percent-encoding",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "futures"
|
|
||||||
version = "0.3.31"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
|
|
||||||
dependencies = [
|
|
||||||
"futures-channel",
|
|
||||||
"futures-core",
|
|
||||||
"futures-executor",
|
|
||||||
"futures-io",
|
|
||||||
"futures-sink",
|
|
||||||
"futures-task",
|
|
||||||
"futures-util",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "futures-channel"
|
name = "futures-channel"
|
||||||
version = "0.3.31"
|
version = "0.3.31"
|
||||||
@@ -574,7 +549,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
|
checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"futures-core",
|
"futures-core",
|
||||||
"futures-sink",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -583,34 +557,12 @@ version = "0.3.31"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
|
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "futures-executor"
|
|
||||||
version = "0.3.31"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
|
|
||||||
dependencies = [
|
|
||||||
"futures-core",
|
|
||||||
"futures-task",
|
|
||||||
"futures-util",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "futures-io"
|
name = "futures-io"
|
||||||
version = "0.3.31"
|
version = "0.3.31"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
|
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "futures-macro"
|
|
||||||
version = "0.3.31"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
|
|
||||||
dependencies = [
|
|
||||||
"proc-macro2",
|
|
||||||
"quote",
|
|
||||||
"syn",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "futures-sink"
|
name = "futures-sink"
|
||||||
version = "0.3.31"
|
version = "0.3.31"
|
||||||
@@ -629,11 +581,8 @@ version = "0.3.31"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
|
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"futures-channel",
|
|
||||||
"futures-core",
|
"futures-core",
|
||||||
"futures-io",
|
"futures-io",
|
||||||
"futures-macro",
|
|
||||||
"futures-sink",
|
|
||||||
"futures-task",
|
"futures-task",
|
||||||
"memchr",
|
"memchr",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
@@ -653,25 +602,14 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "getrandom"
|
name = "getrandom"
|
||||||
version = "0.2.16"
|
version = "0.3.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
|
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
|
||||||
dependencies = [
|
|
||||||
"cfg-if",
|
|
||||||
"libc",
|
|
||||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "getrandom"
|
|
||||||
version = "0.3.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"libc",
|
"libc",
|
||||||
"r-efi",
|
"r-efi",
|
||||||
"wasi 0.14.7+wasi-0.2.4",
|
"wasip2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -936,9 +874,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "indexmap"
|
name = "indexmap"
|
||||||
version = "2.11.4"
|
version = "2.12.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5"
|
checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"equivalent",
|
"equivalent",
|
||||||
"hashbrown 0.16.0",
|
"hashbrown 0.16.0",
|
||||||
@@ -983,7 +921,7 @@ version = "0.1.34"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
|
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"getrandom 0.3.3",
|
"getrandom",
|
||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -1106,19 +1044,19 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"log",
|
"log",
|
||||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
"wasi",
|
||||||
"windows-sys 0.48.0",
|
"windows-sys 0.48.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mio"
|
name = "mio"
|
||||||
version = "1.0.4"
|
version = "1.1.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c"
|
checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
"wasi",
|
||||||
"windows-sys 0.59.0",
|
"windows-sys 0.61.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1156,12 +1094,6 @@ dependencies = [
|
|||||||
"windows-sys 0.61.2",
|
"windows-sys 0.61.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "num-conv"
|
|
||||||
version = "0.1.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "num-traits"
|
name = "num-traits"
|
||||||
version = "0.2.19"
|
version = "0.2.19"
|
||||||
@@ -1185,11 +1117,11 @@ checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "openssl"
|
name = "openssl"
|
||||||
version = "0.10.73"
|
version = "0.10.74"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8"
|
checksum = "24ad14dd45412269e1a30f52ad8f0664f0f4f4a89ee8fe28c3b3527021ebb654"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags 2.9.4",
|
"bitflags 2.10.0",
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"foreign-types",
|
"foreign-types",
|
||||||
"libc",
|
"libc",
|
||||||
@@ -1217,9 +1149,9 @@ checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "openssl-sys"
|
name = "openssl-sys"
|
||||||
version = "0.9.109"
|
version = "0.9.110"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "90096e2e47630d78b7d1c20952dc621f957103f8bc2c8359ec81290d75238571"
|
checksum = "0a9f0075ba3c21b09f8e8b2026584b1d18d49388648f2fbbf3c97ea8deced8e2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cc",
|
"cc",
|
||||||
"libc",
|
"libc",
|
||||||
@@ -1336,21 +1268,6 @@ dependencies = [
|
|||||||
"zerovec",
|
"zerovec",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "powerfmt"
|
|
||||||
version = "0.2.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "ppv-lite86"
|
|
||||||
version = "0.2.21"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
|
|
||||||
dependencies = [
|
|
||||||
"zerocopy",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro2"
|
name = "proc-macro2"
|
||||||
version = "1.0.101"
|
version = "1.0.101"
|
||||||
@@ -1396,18 +1313,6 @@ version = "0.8.5"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc",
|
|
||||||
"rand_chacha",
|
|
||||||
"rand_core",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "rand_chacha"
|
|
||||||
version = "0.3.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
|
||||||
dependencies = [
|
|
||||||
"ppv-lite86",
|
|
||||||
"rand_core",
|
"rand_core",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -1416,9 +1321,6 @@ name = "rand_core"
|
|||||||
version = "0.6.4"
|
version = "0.6.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||||
dependencies = [
|
|
||||||
"getrandom 0.2.16",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ratatui"
|
name = "ratatui"
|
||||||
@@ -1426,7 +1328,7 @@ version = "0.24.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0ebc917cfb527a566c37ecb94c7e3fd098353516fb4eb6bea17015ade0182425"
|
checksum = "0ebc917cfb527a566c37ecb94c7e3fd098353516fb4eb6bea17015ade0182425"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags 2.9.4",
|
"bitflags 2.10.0",
|
||||||
"cassowary",
|
"cassowary",
|
||||||
"crossterm",
|
"crossterm",
|
||||||
"indoc",
|
"indoc",
|
||||||
@@ -1464,7 +1366,7 @@ version = "0.5.18"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
|
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags 2.9.4",
|
"bitflags 2.10.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1542,7 +1444,7 @@ version = "1.1.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e"
|
checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags 2.9.4",
|
"bitflags 2.10.0",
|
||||||
"errno",
|
"errno",
|
||||||
"libc",
|
"libc",
|
||||||
"linux-raw-sys",
|
"linux-raw-sys",
|
||||||
@@ -1600,7 +1502,7 @@ version = "2.11.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
|
checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags 2.9.4",
|
"bitflags 2.10.0",
|
||||||
"core-foundation",
|
"core-foundation",
|
||||||
"core-foundation-sys",
|
"core-foundation-sys",
|
||||||
"libc",
|
"libc",
|
||||||
@@ -1813,9 +1715,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "syn"
|
name = "syn"
|
||||||
version = "2.0.106"
|
version = "2.0.107"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6"
|
checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
@@ -1886,7 +1788,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16"
|
checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"fastrand",
|
"fastrand",
|
||||||
"getrandom 0.3.3",
|
"getrandom",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"rustix",
|
"rustix",
|
||||||
"windows-sys 0.61.2",
|
"windows-sys 0.61.2",
|
||||||
@@ -1921,37 +1823,6 @@ dependencies = [
|
|||||||
"cfg-if",
|
"cfg-if",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "time"
|
|
||||||
version = "0.3.44"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d"
|
|
||||||
dependencies = [
|
|
||||||
"deranged",
|
|
||||||
"itoa",
|
|
||||||
"num-conv",
|
|
||||||
"powerfmt",
|
|
||||||
"serde",
|
|
||||||
"time-core",
|
|
||||||
"time-macros",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "time-core"
|
|
||||||
version = "0.1.6"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b"
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "time-macros"
|
|
||||||
version = "0.2.24"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3"
|
|
||||||
dependencies = [
|
|
||||||
"num-conv",
|
|
||||||
"time-core",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tinystr"
|
name = "tinystr"
|
||||||
version = "0.8.1"
|
version = "0.8.1"
|
||||||
@@ -1970,7 +1841,7 @@ checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"libc",
|
"libc",
|
||||||
"mio 1.0.4",
|
"mio 1.1.0",
|
||||||
"parking_lot",
|
"parking_lot",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"signal-hook-registry",
|
"signal-hook-registry",
|
||||||
@@ -2071,18 +1942,6 @@ dependencies = [
|
|||||||
"tracing-core",
|
"tracing-core",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "tracing-appender"
|
|
||||||
version = "0.2.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "3566e8ce28cc0a3fe42519fc80e6b4c943cc4c8cef275620eb8dac2d3d4e06cf"
|
|
||||||
dependencies = [
|
|
||||||
"crossbeam-channel",
|
|
||||||
"thiserror",
|
|
||||||
"time",
|
|
||||||
"tracing-subscriber",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tracing-attributes"
|
name = "tracing-attributes"
|
||||||
version = "0.1.30"
|
version = "0.1.30"
|
||||||
@@ -2230,15 +2089,6 @@ version = "0.11.1+wasi-snapshot-preview1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
|
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "wasi"
|
|
||||||
version = "0.14.7+wasi-0.2.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c"
|
|
||||||
dependencies = [
|
|
||||||
"wasip2",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wasip2"
|
name = "wasip2"
|
||||||
version = "1.0.1+wasi-0.2.4"
|
version = "1.0.1+wasi-0.2.4"
|
||||||
|
|||||||
48
Cargo.toml
48
Cargo.toml
@@ -1,8 +1,44 @@
|
|||||||
[workspace]
|
[workspace]
|
||||||
members = [
|
members = ["agent", "dashboard", "shared"]
|
||||||
"dashboard",
|
|
||||||
"agent",
|
|
||||||
"shared"
|
|
||||||
]
|
|
||||||
resolver = "2"
|
resolver = "2"
|
||||||
default-members = ["dashboard"]
|
|
||||||
|
[workspace.dependencies]
|
||||||
|
# Async runtime
|
||||||
|
tokio = { version = "1.0", features = ["full"] }
|
||||||
|
|
||||||
|
# Serialization
|
||||||
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
|
serde_json = "1.0"
|
||||||
|
|
||||||
|
# Error handling
|
||||||
|
thiserror = "1.0"
|
||||||
|
anyhow = "1.0"
|
||||||
|
|
||||||
|
# Time handling
|
||||||
|
chrono = { version = "0.4", features = ["serde"] }
|
||||||
|
|
||||||
|
# CLI
|
||||||
|
clap = { version = "4.0", features = ["derive"] }
|
||||||
|
|
||||||
|
# ZMQ communication
|
||||||
|
zmq = "0.10"
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
tracing = "0.1"
|
||||||
|
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
|
||||||
|
|
||||||
|
# TUI (dashboard only)
|
||||||
|
ratatui = "0.24"
|
||||||
|
crossterm = "0.27"
|
||||||
|
|
||||||
|
# Email (agent only)
|
||||||
|
lettre = { version = "0.11", default-features = false, features = ["smtp-transport", "builder"] }
|
||||||
|
|
||||||
|
# System utilities (agent only)
|
||||||
|
gethostname = "0.4"
|
||||||
|
|
||||||
|
# Configuration parsing
|
||||||
|
toml = "0.8"
|
||||||
|
|
||||||
|
# Shared local dependencies
|
||||||
|
cm-dashboard-shared = { path = "./shared" }
|
||||||
742
README.md
742
README.md
@@ -1,544 +1,362 @@
|
|||||||
# CM Dashboard - Infrastructure Monitoring TUI
|
# CM Dashboard
|
||||||
|
|
||||||
A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. Built to replace Glance with a custom solution tailored for specific monitoring needs and API integrations. Features real-time monitoring of all infrastructure components with intelligent email notifications and automatic status calculation.
|
A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. Built with ZMQ-based metric collection and individual metrics architecture.
|
||||||
|
|
||||||
### System Widget
|
## Features
|
||||||
```
|
|
||||||
┌System───────────────────────────────────────────────────────┐
|
|
||||||
│ Memory usage │
|
|
||||||
│✔ 3.0 / 7.8 GB │
|
|
||||||
│ CPU load CPU temp │
|
|
||||||
│✔ 1.05 • 0.96 • 0.58 64.0°C │
|
|
||||||
│ C1E C3 C6 C8 C9 C10 │
|
|
||||||
│✔ 0.5% 0.5% 10.4% 10.2% 0.4% 77.9% │
|
|
||||||
│ GPU load GPU temp │
|
|
||||||
│✔ — — │
|
|
||||||
└─────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
### Services Widget (Enhanced)
|
### Core Monitoring
|
||||||
```
|
- **Real-time metrics**: CPU, RAM, Storage, and Service status
|
||||||
┌Services────────────────────────────────────────────────────┐
|
- **Multi-host support**: Monitor multiple servers from single dashboard
|
||||||
│ Service Memory (GB) CPU Disk │
|
- **Service management**: Start/stop services with intelligent status tracking
|
||||||
│✔ Service Memory 7.1/23899.7 MiB — │
|
- **NixOS integration**: System rebuild via SSH + tmux popup
|
||||||
│✔ Disk Usage — — 45/100 GB │
|
- **Backup monitoring**: Borgbackup status and scheduling
|
||||||
│⚠ CPU Load — 2.18 — │
|
- **Email notifications**: Intelligent batching prevents spam
|
||||||
│✔ CPU Temperature — 47.0°C — │
|
|
||||||
│✔ docker-registry 0.0 GB 0.0% <1 MB │
|
|
||||||
│✔ gitea 0.4/4.1 GB 0.2% 970 MB │
|
|
||||||
│ 1 active connections │
|
|
||||||
│✔ nginx 0.0/1.0 GB 0.0% <1 MB │
|
|
||||||
│✔ ├─ docker.cmtec.se │
|
|
||||||
│✔ ├─ git.cmtec.se │
|
|
||||||
│✔ ├─ gitea.cmtec.se │
|
|
||||||
│✔ ├─ haasp.cmtec.se │
|
|
||||||
│✔ ├─ pages.cmtec.se │
|
|
||||||
│✔ └─ www.kryddorten.se │
|
|
||||||
│✔ postgresql 0.1 GB 0.0% 378 MB │
|
|
||||||
│ 1 active connections │
|
|
||||||
│✔ redis-immich 0.0 GB 0.4% <1 MB │
|
|
||||||
│✔ sshd 0.0 GB 0.0% <1 MB │
|
|
||||||
│ 1 SSH connection │
|
|
||||||
│✔ unifi 0.9/2.0 GB 0.4% 391 MB │
|
|
||||||
└────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
### Storage Widget
|
### User-Stopped Service Tracking
|
||||||
```
|
Services stopped via the dashboard are intelligently tracked to prevent false alerts:
|
||||||
┌Storage──────────────────────────────────────────────────────┐
|
|
||||||
│ Drive Temp Wear Spare Hours Capacity Usage │
|
|
||||||
│✔ nvme0n1 57°C 4% 100% 11463 932G 23G (2%) │
|
|
||||||
│ │
|
|
||||||
└─────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
### Backups Widget
|
- **Smart status reporting**: User-stopped services show as Status::OK instead of Warning
|
||||||
```
|
- **Persistent storage**: Tracking survives agent restarts via JSON storage
|
||||||
┌Backups──────────────────────────────────────────────────────┐
|
- **Automatic management**: Flags cleared when services restarted via dashboard
|
||||||
│ Backup Status Details │
|
- **Maintenance friendly**: No false alerts during intentional service operations
|
||||||
│✔ Latest 3h ago 1.4 GiB │
|
|
||||||
│ 8 archives, 2.4 GiB total │
|
|
||||||
│✔ Disk ok 2.4/468 GB (1%) │
|
|
||||||
└─────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
### Hosts Widget
|
|
||||||
```
|
|
||||||
┌Hosts────────────────────────────────────────────────────────┐
|
|
||||||
│ Host Status Timestamp │
|
|
||||||
│✔ cmbox ok 2025-10-13 05:45:28 │
|
|
||||||
│✔ srv01 ok 2025-10-13 05:45:28 │
|
|
||||||
│? labbox No data received — │
|
|
||||||
└─────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
**Navigation**: `←→` hosts, `r` refresh, `q` quit
|
|
||||||
|
|
||||||
## Key Features
|
|
||||||
|
|
||||||
### Real-time Monitoring
|
|
||||||
- **Multi-host support** for cmbox, labbox, simonbox, steambox, srv01
|
|
||||||
- **Performance-focused** with minimal resource usage
|
|
||||||
- **Keyboard-driven interface** for power users
|
|
||||||
- **ZMQ gossip network** for efficient data distribution
|
|
||||||
|
|
||||||
### Infrastructure Monitoring
|
|
||||||
- **NVMe health monitoring** with wear prediction and temperature tracking
|
|
||||||
- **CPU/Memory/GPU telemetry** with automatic thresholding
|
|
||||||
- **Service resource monitoring** with per-service CPU and RAM usage
|
|
||||||
- **Disk usage overview** for root filesystems
|
|
||||||
- **Backup status** with detailed metrics and history
|
|
||||||
- **C-state monitoring** for CPU power management analysis
|
|
||||||
|
|
||||||
### Intelligent Alerting
|
|
||||||
- **Agent-calculated status** with predefined thresholds
|
|
||||||
- **Email notifications** via SMTP with rate limiting
|
|
||||||
- **Recovery notifications** with context about original issues
|
|
||||||
- **Stockholm timezone** support for email timestamps
|
|
||||||
- **Unified alert pipeline** summarizing host health
|
|
||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|
||||||
### Agent-Dashboard Separation
|
### Individual Metrics Philosophy
|
||||||
The system follows a strict separation of concerns:
|
- **Agent**: Collects individual metrics, calculates status using thresholds
|
||||||
|
- **Dashboard**: Subscribes to specific metrics, composes widgets from individual data
|
||||||
|
- **ZMQ Communication**: Efficient real-time metric transmission
|
||||||
|
- **Status Aggregation**: Host-level status calculated from all service metrics
|
||||||
|
|
||||||
- **Agent**: Single source of truth for all status calculations using defined thresholds
|
### Components
|
||||||
- **Dashboard**: Display-only interface that shows agent-provided status
|
|
||||||
- **Data Flow**: Agent (calculations) → Status → Dashboard (display) → Colors
|
|
||||||
|
|
||||||
### Agent Thresholds (Production)
|
```
|
||||||
- **CPU Load**: Warning ≥ 5.0, Critical ≥ 8.0
|
┌─────────────────┐ ZMQ ┌─────────────────┐
|
||||||
- **Memory Usage**: Warning ≥ 80%, Critical ≥ 95%
|
│ │◄──────────►│ │
|
||||||
- **CPU Temperature**: Warning ≥ 100°C, Critical ≥ 100°C (effectively disabled)
|
│ Agent │ Metrics │ Dashboard │
|
||||||
|
│ - Collectors │ │ - TUI │
|
||||||
### Email Notification System
|
│ - Status │ │ - Widgets │
|
||||||
- **From**: `{hostname}@cmtec.se` (e.g., cmbox@cmtec.se)
|
│ - Tracking │ │ - Commands │
|
||||||
- **To**: `cm@cmtec.se`
|
│ │ │ │
|
||||||
- **SMTP**: localhost:25 (postfix)
|
└─────────────────┘ └─────────────────┘
|
||||||
- **Rate Limiting**: 30 minutes (configurable)
|
│ │
|
||||||
- **Triggers**: Status degradation and recovery with detailed context
|
▼ ▼
|
||||||
|
┌─────────────────┐ ┌─────────────────┐
|
||||||
## Installation
|
│ JSON Storage │ │ SSH + tmux │
|
||||||
|
│ - User-stopped │ │ - Remote rebuild│
|
||||||
### Requirements
|
│ - Cache │ │ - Process │
|
||||||
- Rust toolchain 1.75+ (install via [`rustup`](https://rustup.rs))
|
│ - State │ │ isolation │
|
||||||
- Root privileges for agent (hardware monitoring access)
|
└─────────────────┘ └─────────────────┘
|
||||||
- Network access for ZMQ communication (default port 6130)
|
|
||||||
- SMTP server for notifications (postfix recommended)
|
|
||||||
|
|
||||||
### Build from Source
|
|
||||||
```bash
|
|
||||||
git clone https://github.com/cmtec/cm-dashboard.git
|
|
||||||
cd cm-dashboard
|
|
||||||
cargo build --release
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Optimized binaries available at:
|
### Service Control Flow
|
||||||
- Dashboard: `target/release/cm-dashboard`
|
|
||||||
- Agent: `target/release/cm-dashboard-agent`
|
|
||||||
|
|
||||||
### Installation
|
1. **User Action**: Dashboard sends `UserStart`/`UserStop` commands
|
||||||
```bash
|
2. **Agent Processing**:
|
||||||
# Install dashboard
|
- Marks service as user-stopped (if stopping)
|
||||||
cargo install --path dashboard
|
- Executes `systemctl start/stop service`
|
||||||
|
- Syncs state to global tracker
|
||||||
|
3. **Status Calculation**:
|
||||||
|
- Systemd collector checks user-stopped flag
|
||||||
|
- Reports Status::OK for user-stopped inactive services
|
||||||
|
- Normal Warning status for system failures
|
||||||
|
|
||||||
|
## Interface
|
||||||
|
|
||||||
# Install agent (requires root for hardware access)
|
|
||||||
sudo cargo install --path agent
|
|
||||||
```
|
```
|
||||||
|
cm-dashboard • ● cmbox ● srv01 ● srv02 ● steambox
|
||||||
|
┌system──────────────────────────────┐┌services─────────────────────────────────────────┐
|
||||||
|
│NixOS: ││Service: Status: RAM: Disk: │
|
||||||
|
│Build: 25.05.20251004.3bcc93c ││● docker active 27M 496MB │
|
||||||
|
│Agent: v0.1.43 ││● gitea active 579M 2.6GB │
|
||||||
|
│Active users: cm, simon ││● nginx active 28M 24MB │
|
||||||
|
│CPU: ││ ├─ ● gitea.cmtec.se 51ms │
|
||||||
|
│● Load: 0.10 0.52 0.88 • 3000MHz ││ ├─ ● photos.cmtec.se 41ms │
|
||||||
|
│RAM: ││● postgresql active 112M 357MB │
|
||||||
|
│● Usage: 33% 2.6GB/7.6GB ││● redis-immich user-stopped │
|
||||||
|
│● /tmp: 0% 0B/2.0GB ││● sshd active 2M 0 │
|
||||||
|
│Storage: ││● unifi active 594M 495MB │
|
||||||
|
│● root (Single): ││ │
|
||||||
|
│ ├─ ● nvme0n1 W: 1% ││ │
|
||||||
|
│ └─ ● 18% 167.4GB/928.2GB ││ │
|
||||||
|
└────────────────────────────────────┘└─────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
### Navigation
|
||||||
|
- **Tab**: Switch between hosts
|
||||||
|
- **↑↓ or j/k**: Navigate services
|
||||||
|
- **s**: Start selected service (UserStart)
|
||||||
|
- **S**: Stop selected service (UserStop)
|
||||||
|
- **J**: Show service logs (journalctl in tmux popup)
|
||||||
|
- **R**: Rebuild current host
|
||||||
|
- **q**: Quit
|
||||||
|
|
||||||
|
### Status Indicators
|
||||||
|
- **Green ●**: Active service
|
||||||
|
- **Yellow ◐**: Inactive service (system issue)
|
||||||
|
- **Red ◯**: Failed service
|
||||||
|
- **Blue arrows**: Service transitioning (↑ starting, ↓ stopping, ↻ restarting)
|
||||||
|
- **"user-stopped"**: Service stopped via dashboard (Status::OK)
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
### Dashboard
|
### Building
|
||||||
```bash
|
|
||||||
# Run with default configuration
|
|
||||||
cm-dashboard
|
|
||||||
|
|
||||||
# Specify host to monitor
|
|
||||||
cm-dashboard --host cmbox
|
|
||||||
|
|
||||||
# Override ZMQ endpoints
|
|
||||||
cm-dashboard --zmq-endpoint tcp://srv01:6130,tcp://labbox:6130
|
|
||||||
|
|
||||||
# Increase logging verbosity
|
|
||||||
cm-dashboard -v
|
|
||||||
```
|
|
||||||
|
|
||||||
### Agent (Pure Auto-Discovery)
|
|
||||||
The agent requires **no configuration files** and auto-discovers all system components:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Basic agent startup (auto-detects everything)
|
# With Nix (recommended)
|
||||||
sudo cm-dashboard-agent
|
nix-shell -p openssl pkg-config --run "cargo build --workspace"
|
||||||
|
|
||||||
# With verbose logging for troubleshooting
|
# Or with system dependencies
|
||||||
sudo cm-dashboard-agent -v
|
sudo apt install libssl-dev pkg-config # Ubuntu/Debian
|
||||||
|
cargo build --workspace
|
||||||
```
|
```
|
||||||
|
|
||||||
The agent automatically:
|
### Running
|
||||||
- **Discovers storage devices** for SMART monitoring
|
|
||||||
- **Detects running systemd services** for resource tracking
|
```bash
|
||||||
- **Configures collection intervals** based on system capabilities
|
# Start agent (requires configuration)
|
||||||
- **Sets up email notifications** using hostname@cmtec.se
|
./target/debug/cm-dashboard-agent --config /etc/cm-dashboard/agent.toml
|
||||||
|
|
||||||
|
# Start dashboard (inside tmux session)
|
||||||
|
tmux
|
||||||
|
./target/debug/cm-dashboard --config /etc/cm-dashboard/dashboard.toml
|
||||||
|
```
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|
||||||
### Dashboard Configuration
|
### Agent Configuration
|
||||||
The dashboard creates `config/dashboard.toml` on first run:
|
|
||||||
|
|
||||||
```toml
|
```toml
|
||||||
[hosts]
|
collection_interval_seconds = 2
|
||||||
default_host = "srv01"
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
[zmq]
|
||||||
name = "srv01"
|
publisher_port = 6130
|
||||||
|
command_port = 6131
|
||||||
|
bind_address = "0.0.0.0"
|
||||||
|
transmission_interval_seconds = 2
|
||||||
|
|
||||||
|
[collectors.cpu]
|
||||||
enabled = true
|
enabled = true
|
||||||
|
interval_seconds = 2
|
||||||
|
load_warning_threshold = 5.0
|
||||||
|
load_critical_threshold = 10.0
|
||||||
|
|
||||||
[[hosts.hosts]]
|
[collectors.memory]
|
||||||
name = "cmbox"
|
|
||||||
enabled = true
|
enabled = true
|
||||||
|
interval_seconds = 2
|
||||||
|
usage_warning_percent = 80.0
|
||||||
|
usage_critical_percent = 90.0
|
||||||
|
|
||||||
[dashboard]
|
[collectors.systemd]
|
||||||
tick_rate_ms = 250
|
enabled = true
|
||||||
history_duration_minutes = 60
|
interval_seconds = 10
|
||||||
|
service_name_filters = ["nginx*", "postgresql*", "docker*", "sshd*"]
|
||||||
|
excluded_services = ["nginx-config-reload", "systemd-", "getty@"]
|
||||||
|
nginx_latency_critical_ms = 1000.0
|
||||||
|
http_timeout_seconds = 10
|
||||||
|
|
||||||
[data_source]
|
[notifications]
|
||||||
kind = "zmq"
|
enabled = true
|
||||||
|
smtp_host = "localhost"
|
||||||
[data_source.zmq]
|
smtp_port = 25
|
||||||
endpoints = ["tcp://127.0.0.1:6130"]
|
from_email = "{hostname}@example.com"
|
||||||
|
to_email = "admin@example.com"
|
||||||
|
aggregation_interval_seconds = 30
|
||||||
```
|
```
|
||||||
|
|
||||||
### Agent Configuration (Optional)
|
### Dashboard Configuration
|
||||||
The agent works without configuration but supports optional settings:
|
|
||||||
|
```toml
|
||||||
|
[zmq]
|
||||||
|
subscriber_ports = [6130]
|
||||||
|
|
||||||
|
[hosts]
|
||||||
|
predefined_hosts = ["cmbox", "srv01", "srv02"]
|
||||||
|
|
||||||
|
[ui]
|
||||||
|
ssh_user = "cm"
|
||||||
|
rebuild_alias = "nixos-rebuild-cmtec"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Technical Implementation
|
||||||
|
|
||||||
|
### Collectors
|
||||||
|
|
||||||
|
#### Systemd Collector
|
||||||
|
- **Service Discovery**: Uses `systemctl list-unit-files` + `list-units --all`
|
||||||
|
- **Status Calculation**: Checks user-stopped flag before assigning Warning status
|
||||||
|
- **Memory Tracking**: Per-service memory usage via `systemctl show`
|
||||||
|
- **Sub-services**: Nginx site latency, Docker containers
|
||||||
|
- **User-stopped Integration**: `UserStoppedServiceTracker::is_service_user_stopped()`
|
||||||
|
|
||||||
|
#### User-Stopped Service Tracker
|
||||||
|
- **Storage**: `/var/lib/cm-dashboard/user-stopped-services.json`
|
||||||
|
- **Thread Safety**: Global singleton with `Arc<Mutex<>>`
|
||||||
|
- **Persistence**: Automatic save on state changes
|
||||||
|
- **Global Access**: Static methods for collector integration
|
||||||
|
|
||||||
|
#### Other Collectors
|
||||||
|
- **CPU**: Load average, temperature, frequency monitoring
|
||||||
|
- **Memory**: RAM/swap usage, tmpfs monitoring
|
||||||
|
- **Disk**: Filesystem usage, SMART health data
|
||||||
|
- **NixOS**: Build version, active users, agent version
|
||||||
|
- **Backup**: Borgbackup repository status and metrics
|
||||||
|
|
||||||
|
### ZMQ Protocol
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// Metric Message
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
pub struct MetricMessage {
|
||||||
|
pub hostname: String,
|
||||||
|
pub timestamp: u64,
|
||||||
|
pub metrics: Vec<Metric>,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Service Commands
|
||||||
|
pub enum AgentCommand {
|
||||||
|
ServiceControl {
|
||||||
|
service_name: String,
|
||||||
|
action: ServiceAction,
|
||||||
|
},
|
||||||
|
SystemRebuild { /* SSH config */ },
|
||||||
|
CollectNow,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum ServiceAction {
|
||||||
|
Start, // System-initiated
|
||||||
|
Stop, // System-initiated
|
||||||
|
UserStart, // User via dashboard (clears user-stopped)
|
||||||
|
UserStop, // User via dashboard (marks user-stopped)
|
||||||
|
Status,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Maintenance Mode
|
||||||
|
|
||||||
|
Suppress notifications during planned maintenance:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Generate example configuration
|
# Enable maintenance mode
|
||||||
cm-dashboard-agent --help
|
touch /tmp/cm-maintenance
|
||||||
|
|
||||||
# Override specific settings
|
# Perform maintenance
|
||||||
sudo cm-dashboard-agent \
|
systemctl stop service
|
||||||
--hostname cmbox \
|
# ... work ...
|
||||||
--bind tcp://*:6130 \
|
systemctl start service
|
||||||
--interval 5000
|
|
||||||
|
# Disable maintenance mode
|
||||||
|
rm /tmp/cm-maintenance
|
||||||
```
|
```
|
||||||
|
|
||||||
## Widget Layout
|
|
||||||
|
|
||||||
### Services Widget Structure
|
|
||||||
The Services widget now displays both system metrics and services in a unified table:
|
|
||||||
|
|
||||||
```
|
|
||||||
┌Services────────────────────────────────────────────────────┐
|
|
||||||
│ Service Memory (GB) CPU Disk │
|
|
||||||
│✔ Service Memory 7.1/23899.7 MiB — │ ← System metric as service row
|
|
||||||
│✔ Disk Usage — — 45/100 GB │ ← System metric as service row
|
|
||||||
│⚠ CPU Load — 2.18 — │ ← System metric as service row
|
|
||||||
│✔ CPU Temperature — 47.0°C — │ ← System metric as service row
|
|
||||||
│✔ docker-registry 0.0 GB 0.0% <1 MB │ ← Regular service
|
|
||||||
│✔ nginx 0.0/1.0 GB 0.0% <1 MB │ ← Regular service
|
|
||||||
│✔ ├─ docker.cmtec.se │ ← Nginx site (sub-service)
|
|
||||||
│✔ ├─ git.cmtec.se │ ← Nginx site (sub-service)
|
|
||||||
│✔ └─ gitea.cmtec.se │ ← Nginx site (sub-service)
|
|
||||||
│✔ sshd 0.0 GB 0.0% <1 MB │ ← Regular service
|
|
||||||
│ 1 SSH connection │ ← Service description
|
|
||||||
└────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
**Row Types:**
|
|
||||||
- **System Metrics**: CPU Load, Service Memory, Disk Usage, CPU Temperature with status indicators
|
|
||||||
- **Regular Services**: Full resource data (memory, CPU, disk) with optional description lines
|
|
||||||
- **Sub-services**: Nginx sites with tree structure, status indicators only (no resource columns)
|
|
||||||
- **Description Lines**: Connection counts and service-specific info without status indicators
|
|
||||||
|
|
||||||
### Hosts Widget (formerly Alerts)
|
|
||||||
The Hosts widget provides a summary view of all monitored hosts:
|
|
||||||
|
|
||||||
```
|
|
||||||
┌Hosts────────────────────────────────────────────────────────┐
|
|
||||||
│ Host Status Timestamp │
|
|
||||||
│✔ cmbox ok 2025-10-13 05:45:28 │
|
|
||||||
│✔ srv01 ok 2025-10-13 05:45:28 │
|
|
||||||
│? labbox No data received — │
|
|
||||||
└─────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
## Monitoring Components
|
|
||||||
|
|
||||||
### System Collector
|
|
||||||
- **CPU Load**: 1/5/15 minute averages with warning/critical thresholds
|
|
||||||
- **Memory Usage**: Used/total with percentage calculation
|
|
||||||
- **CPU Temperature**: x86_pkg_temp prioritized for accuracy
|
|
||||||
- **C-States**: Power management state distribution (C0-C10)
|
|
||||||
|
|
||||||
### Service Collector
|
|
||||||
- **System Metrics as Services**: CPU Load, Service Memory, Disk Usage, CPU Temperature displayed as individual service rows
|
|
||||||
- **Systemd Services**: Auto-discovery of interesting services with resource monitoring
|
|
||||||
- **Nginx Site Monitoring**: Individual rows for each nginx virtual host with tree structure (`├─` and `└─`)
|
|
||||||
- **Resource Usage**: Per-service memory, CPU, and disk consumption
|
|
||||||
- **Service Health**: Running/stopped/degraded status with detailed failure info
|
|
||||||
- **Connection Tracking**: SSH connections, database connections as description lines
|
|
||||||
|
|
||||||
### SMART Collector
|
|
||||||
- **NVMe Health**: Temperature, wear leveling, spare blocks
|
|
||||||
- **Drive Capacity**: Total/used space with percentage
|
|
||||||
- **SMART Attributes**: Critical health indicators
|
|
||||||
|
|
||||||
### Backup Collector
|
|
||||||
- **Restic Integration**: Backup status and history
|
|
||||||
- **Health Monitoring**: Success/failure tracking
|
|
||||||
- **Storage Metrics**: Backup size and retention
|
|
||||||
|
|
||||||
## Keyboard Controls
|
|
||||||
|
|
||||||
| Key | Action |
|
|
||||||
|-----|--------|
|
|
||||||
| `←` / `h` | Previous host |
|
|
||||||
| `→` / `l` / `Tab` | Next host |
|
|
||||||
| `?` | Toggle help overlay |
|
|
||||||
| `r` | Force refresh |
|
|
||||||
| `q` / `Esc` | Quit |
|
|
||||||
|
|
||||||
## Email Notifications
|
## Email Notifications
|
||||||
|
|
||||||
### Notification Triggers
|
### Intelligent Batching
|
||||||
- **Status Degradation**: Any status change to warning/critical
|
- **Real-time dashboard**: Immediate status updates
|
||||||
- **Recovery**: Warning/critical status returning to ok
|
- **Batched emails**: Aggregated every 30 seconds
|
||||||
- **Service Failures**: Individual service stop/start events
|
- **Smart grouping**: Services organized by severity
|
||||||
|
- **Recovery suppression**: Reduces notification spam
|
||||||
|
|
||||||
### Example Recovery Email
|
### Example Alert
|
||||||
```
|
```
|
||||||
✅ RESOLVED: system cpu on cmbox
|
Subject: Status Alert: 1 critical, 2 warnings, 0 recoveries
|
||||||
|
|
||||||
Status Change Alert
|
Status Summary (30s duration)
|
||||||
|
Host Status: Ok → Warning
|
||||||
|
|
||||||
Host: cmbox
|
🔴 CRITICAL ISSUES (1):
|
||||||
Component: system
|
postgresql: Ok → Critical (memory usage 95%)
|
||||||
Metric: cpu
|
|
||||||
Status Change: warning → ok
|
|
||||||
Time: 2025-10-12 22:15:30 CET
|
|
||||||
|
|
||||||
Details:
|
🟡 WARNINGS (2):
|
||||||
Recovered from: CPU load (1/5/15min): 6.20 / 5.80 / 4.50
|
nginx: Ok → Warning (high load 8.5)
|
||||||
Current status: CPU load (1/5/15min): 3.30 / 3.17 / 2.84
|
redis: user-stopped → Warning (restarted by system)
|
||||||
|
|
||||||
|
✅ RECOVERIES (0):
|
||||||
|
|
||||||
--
|
--
|
||||||
CM Dashboard Agent
|
CM Dashboard Agent v0.1.43
|
||||||
Generated at 2025-10-12 22:15:30 CET
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Rate Limiting
|
|
||||||
- **Default**: 30 minutes between notifications per component
|
|
||||||
- **Testing**: Set to 0 for immediate notifications
|
|
||||||
- **Configurable**: Adjustable per deployment needs
|
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
||||||
### Project Structure
|
### Project Structure
|
||||||
```
|
```
|
||||||
cm-dashboard/
|
cm-dashboard/
|
||||||
├── agent/ # Monitoring agent
|
├── agent/ # Metrics collection agent
|
||||||
│ ├── src/
|
│ ├── src/
|
||||||
│ │ ├── collectors/ # Data collection modules
|
│ │ ├── collectors/ # CPU, memory, disk, systemd, backup, nixos
|
||||||
│ │ ├── notifications.rs # Email notification system
|
│ │ ├── service_tracker.rs # User-stopped service tracking
|
||||||
│ │ └── simple_agent.rs # Main agent logic
|
│ │ ├── status/ # Status aggregation and notifications
|
||||||
├── dashboard/ # TUI dashboard
|
│ │ ├── config/ # TOML configuration loading
|
||||||
|
│ │ └── communication/ # ZMQ message handling
|
||||||
|
├── dashboard/ # TUI dashboard application
|
||||||
│ ├── src/
|
│ ├── src/
|
||||||
│ │ ├── ui/ # Widget implementations
|
│ │ ├── ui/widgets/ # CPU, memory, services, backup, system
|
||||||
│ │ ├── data/ # Data structures
|
│ │ ├── communication/ # ZMQ consumption and commands
|
||||||
│ │ └── app.rs # Application state
|
│ │ └── app.rs # Main application loop
|
||||||
├── shared/ # Common data structures
|
├── shared/ # Shared types and utilities
|
||||||
└── config/ # Configuration files
|
│ └── src/
|
||||||
|
│ ├── metrics.rs # Metric, Status, StatusTracker types
|
||||||
|
│ ├── protocol.rs # ZMQ message format
|
||||||
|
│ └── cache.rs # Cache configuration
|
||||||
|
└── CLAUDE.md # Development guidelines and rules
|
||||||
```
|
```
|
||||||
|
|
||||||
### Development Commands
|
### Testing
|
||||||
```bash
|
```bash
|
||||||
# Format code
|
# Build and test
|
||||||
cargo fmt
|
nix-shell -p openssl pkg-config --run "cargo build --workspace"
|
||||||
|
nix-shell -p openssl pkg-config --run "cargo test --workspace"
|
||||||
|
|
||||||
# Check all packages
|
# Code quality
|
||||||
cargo check
|
cargo fmt --all
|
||||||
|
cargo clippy --workspace -- -D warnings
|
||||||
# Run tests
|
|
||||||
cargo test
|
|
||||||
|
|
||||||
# Build release
|
|
||||||
cargo build --release
|
|
||||||
|
|
||||||
# Run with logging
|
|
||||||
RUST_LOG=debug cargo run -p cm-dashboard-agent
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Architecture Principles
|
## Deployment
|
||||||
|
|
||||||
#### Status Calculation Rules
|
### Automated Binary Releases
|
||||||
- **Agent calculates all status** using predefined thresholds
|
|
||||||
- **Dashboard never calculates status** - only displays agent data
|
|
||||||
- **No hardcoded thresholds in dashboard** widgets
|
|
||||||
- **Use "unknown" when agent status missing** (never default to "ok")
|
|
||||||
|
|
||||||
#### Data Flow
|
|
||||||
```
|
|
||||||
System Metrics → Agent Collectors → Status Calculation → ZMQ → Dashboard → Display
|
|
||||||
↓
|
|
||||||
Email Notifications
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Pure Auto-Discovery
|
|
||||||
- **No config files required** for basic operation
|
|
||||||
- **Runtime discovery** of system capabilities
|
|
||||||
- **Service auto-detection** via systemd patterns
|
|
||||||
- **Storage device enumeration** via /sys filesystem
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
### Common Issues
|
|
||||||
|
|
||||||
#### Agent Won't Start
|
|
||||||
```bash
|
```bash
|
||||||
# Check permissions (agent requires root)
|
# Create new release
|
||||||
sudo cm-dashboard-agent -v
|
cd ~/projects/cm-dashboard
|
||||||
|
git tag v0.1.X
|
||||||
# Verify ZMQ binding
|
git push origin v0.1.X
|
||||||
sudo netstat -tulpn | grep 6130
|
|
||||||
|
|
||||||
# Check system access
|
|
||||||
sudo smartctl --scan
|
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Dashboard Connection Issues
|
This triggers automated:
|
||||||
```bash
|
- Static binary compilation with `RUSTFLAGS="-C target-feature=+crt-static"`
|
||||||
# Test ZMQ connectivity
|
- GitHub-style release creation
|
||||||
cm-dashboard --zmq-endpoint tcp://target-host:6130 -v
|
- Tarball upload to Gitea
|
||||||
|
|
||||||
# Check network connectivity
|
### NixOS Integration
|
||||||
telnet target-host 6130
|
Update `~/projects/nixosbox/hosts/common/cm-dashboard.nix`:
|
||||||
```
|
|
||||||
|
|
||||||
#### Email Notifications Not Working
|
|
||||||
```bash
|
|
||||||
# Check postfix status
|
|
||||||
sudo systemctl status postfix
|
|
||||||
|
|
||||||
# Test SMTP manually
|
|
||||||
telnet localhost 25
|
|
||||||
|
|
||||||
# Verify notification settings
|
|
||||||
sudo cm-dashboard-agent -v | grep notification
|
|
||||||
```
|
|
||||||
|
|
||||||
### Logging
|
|
||||||
Set `RUST_LOG=debug` for detailed logging:
|
|
||||||
```bash
|
|
||||||
RUST_LOG=debug sudo cm-dashboard-agent
|
|
||||||
RUST_LOG=debug cm-dashboard
|
|
||||||
```
|
|
||||||
|
|
||||||
## License
|
|
||||||
|
|
||||||
MIT License - see LICENSE file for details.
|
|
||||||
|
|
||||||
## Contributing
|
|
||||||
|
|
||||||
1. Fork the repository
|
|
||||||
2. Create feature branch (`git checkout -b feature/amazing-feature`)
|
|
||||||
3. Commit changes (`git commit -m 'Add amazing feature'`)
|
|
||||||
4. Push to branch (`git push origin feature/amazing-feature`)
|
|
||||||
5. Open Pull Request
|
|
||||||
|
|
||||||
For bugs and feature requests, please use GitHub Issues.
|
|
||||||
|
|
||||||
## NixOS Integration
|
|
||||||
|
|
||||||
### Updating cm-dashboard in NixOS Configuration
|
|
||||||
|
|
||||||
When new code is pushed to the cm-dashboard repository, follow these steps to update the NixOS configuration:
|
|
||||||
|
|
||||||
#### 1. Get the Latest Commit Hash
|
|
||||||
```bash
|
|
||||||
# Get the latest commit from the API
|
|
||||||
curl -s "https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/commits?sha=main&limit=1" | head -20
|
|
||||||
|
|
||||||
# Or use git
|
|
||||||
git log --oneline -1
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 2. Update the NixOS Configuration
|
|
||||||
Edit `hosts/common/cm-dashboard.nix` and update the `rev` field:
|
|
||||||
```nix
|
```nix
|
||||||
src = pkgs.fetchFromGitea {
|
version = "v0.1.43";
|
||||||
domain = "gitea.cmtec.se";
|
src = pkgs.fetchurl {
|
||||||
owner = "cm";
|
url = "https://gitea.cmtec.se/cm/cm-dashboard/releases/download/${version}/cm-dashboard-linux-x86_64.tar.gz";
|
||||||
repo = "cm-dashboard";
|
sha256 = "sha256-HASH";
|
||||||
rev = "f786d054f2ece80823f85e46933857af96e241b2"; # Update this
|
|
||||||
hash = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA="; # Reset temporarily
|
|
||||||
};
|
};
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 3. Get the Correct Hash
|
Get hash via:
|
||||||
Build with placeholder hash to get the actual hash:
|
|
||||||
```bash
|
```bash
|
||||||
nix-build --no-out-link -E 'with import <nixpkgs> {}; fetchFromGitea {
|
cd ~/projects/nixosbox
|
||||||
domain = "gitea.cmtec.se";
|
nix-build --no-out-link -E 'with import <nixpkgs> {}; fetchurl {
|
||||||
owner = "cm";
|
url = "URL_HERE";
|
||||||
repo = "cm-dashboard";
|
sha256 = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=";
|
||||||
rev = "YOUR_COMMIT_HASH";
|
|
||||||
hash = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=";
|
|
||||||
}' 2>&1 | grep "got:"
|
}' 2>&1 | grep "got:"
|
||||||
```
|
```
|
||||||
|
|
||||||
Example output:
|
## Monitoring Intervals
|
||||||
```
|
|
||||||
error: hash mismatch in fixed-output derivation '/nix/store/...':
|
|
||||||
specified: sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=
|
|
||||||
got: sha256-x8crxNusOUYRrkP9mYEOG+Ga3JCPIdJLkEAc5P1ZxdQ=
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 4. Update the Hash
|
- **Metrics Collection**: 2 seconds (CPU, memory, services)
|
||||||
Replace the placeholder with the correct hash from the error message (the "got:" line):
|
- **Metric Transmission**: 2 seconds (ZMQ publish)
|
||||||
```nix
|
- **Dashboard Updates**: 1 second (UI refresh)
|
||||||
hash = "sha256-vjy+j91iDCHUf0RE43anK4WZ+rKcyohP/3SykwZGof8="; # Use actual hash
|
- **Email Notifications**: 30 seconds (batched)
|
||||||
```
|
- **Disk Monitoring**: 300 seconds (5 minutes)
|
||||||
|
- **Service Discovery**: 300 seconds (5 minutes cache)
|
||||||
|
|
||||||
#### 5. Update Cargo Dependencies (if needed)
|
## License
|
||||||
If Cargo.lock has changed, you may need to update `cargoHash`:
|
|
||||||
```bash
|
|
||||||
# Build to get cargo hash error
|
|
||||||
nix-build --no-out-link --expr 'with import <nixpkgs> {}; rustPlatform.buildRustPackage rec {
|
|
||||||
pname = "cm-dashboard";
|
|
||||||
version = "0.1.0";
|
|
||||||
src = fetchFromGitea {
|
|
||||||
domain = "gitea.cmtec.se";
|
|
||||||
owner = "cm";
|
|
||||||
repo = "cm-dashboard";
|
|
||||||
rev = "YOUR_COMMIT_HASH";
|
|
||||||
hash = "YOUR_SOURCE_HASH";
|
|
||||||
};
|
|
||||||
cargoHash = "";
|
|
||||||
nativeBuildInputs = [ pkg-config ];
|
|
||||||
buildInputs = [ openssl ];
|
|
||||||
buildAndTestSubdir = ".";
|
|
||||||
cargoBuildFlags = [ "--workspace" ];
|
|
||||||
}' 2>&1 | grep "got:"
|
|
||||||
```
|
|
||||||
|
|
||||||
Then update `cargoHash` in the configuration.
|
MIT License - see LICENSE file for details.
|
||||||
|
|
||||||
#### 6. Commit the Changes
|
|
||||||
```bash
|
|
||||||
git add hosts/common/cm-dashboard.nix
|
|
||||||
git commit -m "Update cm-dashboard to latest version"
|
|
||||||
git push
|
|
||||||
```
|
|
||||||
|
|
||||||
### Example Update Process
|
|
||||||
```bash
|
|
||||||
# 1. Get latest commit
|
|
||||||
LATEST_COMMIT=$(curl -s "https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/commits?sha=main&limit=1" | grep '"sha"' | head -1 | cut -d'"' -f4)
|
|
||||||
|
|
||||||
# 2. Get source hash
|
|
||||||
SOURCE_HASH=$(nix-build --no-out-link -E "with import <nixpkgs> {}; fetchFromGitea { domain = \"gitea.cmtec.se\"; owner = \"cm\"; repo = \"cm-dashboard\"; rev = \"$LATEST_COMMIT\"; hash = \"sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\"; }" 2>&1 | grep "got:" | cut -d' ' -f12)
|
|
||||||
|
|
||||||
# 3. Update configuration and commit
|
|
||||||
echo "Latest commit: $LATEST_COMMIT"
|
|
||||||
echo "Source hash: $SOURCE_HASH"
|
|
||||||
```
|
|
||||||
@@ -1,25 +1,23 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "cm-dashboard-agent"
|
name = "cm-dashboard-agent"
|
||||||
version = "0.1.0"
|
version = "0.1.50"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
cm-dashboard-shared = { path = "../shared" }
|
cm-dashboard-shared = { workspace = true }
|
||||||
anyhow = "1.0"
|
tokio = { workspace = true }
|
||||||
async-trait = "0.1"
|
serde = { workspace = true }
|
||||||
clap = { version = "4.0", features = ["derive"] }
|
serde_json = { workspace = true }
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
thiserror = { workspace = true }
|
||||||
serde_json = "1.0"
|
anyhow = { workspace = true }
|
||||||
chrono = { version = "0.4", features = ["serde", "clock"] }
|
chrono = { workspace = true }
|
||||||
|
clap = { workspace = true }
|
||||||
|
zmq = { workspace = true }
|
||||||
|
tracing = { workspace = true }
|
||||||
|
tracing-subscriber = { workspace = true }
|
||||||
|
lettre = { workspace = true }
|
||||||
|
gethostname = { workspace = true }
|
||||||
chrono-tz = "0.8"
|
chrono-tz = "0.8"
|
||||||
thiserror = "1.0"
|
toml = { workspace = true }
|
||||||
tracing = "0.1"
|
async-trait = "0.1"
|
||||||
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
|
reqwest = { version = "0.11", features = ["json", "blocking"] }
|
||||||
tracing-appender = "0.2"
|
|
||||||
zmq = "0.10"
|
|
||||||
tokio = { version = "1.0", features = ["full", "process"] }
|
|
||||||
futures = "0.3"
|
|
||||||
rand = "0.8"
|
|
||||||
gethostname = "0.4"
|
|
||||||
lettre = { version = "0.11", default-features = false, features = ["smtp-transport", "builder"] }
|
|
||||||
reqwest = { version = "0.11", features = ["json"] }
|
|
||||||
371
agent/src/agent.rs
Normal file
371
agent/src/agent.rs
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use gethostname::gethostname;
|
||||||
|
use std::time::Duration;
|
||||||
|
use tokio::time::interval;
|
||||||
|
use tracing::{debug, error, info};
|
||||||
|
|
||||||
|
use crate::communication::{AgentCommand, ServiceAction, ZmqHandler};
|
||||||
|
use crate::config::AgentConfig;
|
||||||
|
use crate::metrics::MetricCollectionManager;
|
||||||
|
use crate::notifications::NotificationManager;
|
||||||
|
use crate::service_tracker::UserStoppedServiceTracker;
|
||||||
|
use crate::status::HostStatusManager;
|
||||||
|
use cm_dashboard_shared::{Metric, MetricMessage, MetricValue, Status};
|
||||||
|
|
||||||
|
pub struct Agent {
|
||||||
|
hostname: String,
|
||||||
|
config: AgentConfig,
|
||||||
|
zmq_handler: ZmqHandler,
|
||||||
|
metric_manager: MetricCollectionManager,
|
||||||
|
notification_manager: NotificationManager,
|
||||||
|
host_status_manager: HostStatusManager,
|
||||||
|
service_tracker: UserStoppedServiceTracker,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Agent {
|
||||||
|
pub async fn new(config_path: Option<String>) -> Result<Self> {
|
||||||
|
let hostname = gethostname().to_string_lossy().to_string();
|
||||||
|
info!("Initializing agent for host: {}", hostname);
|
||||||
|
|
||||||
|
// Load configuration (now required)
|
||||||
|
let config_path = config_path.ok_or_else(|| anyhow::anyhow!("Configuration file path is required"))?;
|
||||||
|
let config = AgentConfig::from_file(&config_path)?;
|
||||||
|
|
||||||
|
info!("Agent configuration loaded");
|
||||||
|
|
||||||
|
// Initialize ZMQ communication
|
||||||
|
let zmq_handler = ZmqHandler::new(&config.zmq).await?;
|
||||||
|
info!(
|
||||||
|
"ZMQ communication initialized on port {}",
|
||||||
|
config.zmq.publisher_port
|
||||||
|
);
|
||||||
|
|
||||||
|
// Initialize metric collection manager with cache config
|
||||||
|
let metric_manager = MetricCollectionManager::new(&config.collectors, &config).await?;
|
||||||
|
info!("Metric collection manager initialized");
|
||||||
|
|
||||||
|
// Initialize notification manager
|
||||||
|
let notification_manager = NotificationManager::new(&config.notifications, &hostname)?;
|
||||||
|
info!("Notification manager initialized");
|
||||||
|
|
||||||
|
// Initialize host status manager
|
||||||
|
let host_status_manager = HostStatusManager::new(config.status_aggregation.clone());
|
||||||
|
info!("Host status manager initialized");
|
||||||
|
|
||||||
|
// Initialize user-stopped service tracker
|
||||||
|
let service_tracker = UserStoppedServiceTracker::init_global()?;
|
||||||
|
info!("User-stopped service tracker initialized");
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
hostname,
|
||||||
|
config,
|
||||||
|
zmq_handler,
|
||||||
|
metric_manager,
|
||||||
|
notification_manager,
|
||||||
|
host_status_manager,
|
||||||
|
service_tracker,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn run(&mut self, mut shutdown_rx: tokio::sync::oneshot::Receiver<()>) -> Result<()> {
|
||||||
|
info!("Starting agent main loop with separated collection and transmission");
|
||||||
|
|
||||||
|
// CRITICAL: Collect ALL data immediately at startup before entering the loop
|
||||||
|
info!("Performing initial FORCE collection of all metrics at startup");
|
||||||
|
if let Err(e) = self.collect_all_metrics_force().await {
|
||||||
|
error!("Failed to collect initial metrics: {}", e);
|
||||||
|
} else {
|
||||||
|
info!("Initial metric collection completed - all data cached and ready");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Separate intervals for collection, transmission, and email notifications
|
||||||
|
let mut collection_interval =
|
||||||
|
interval(Duration::from_secs(self.config.collection_interval_seconds));
|
||||||
|
let mut transmission_interval = interval(Duration::from_secs(self.config.zmq.transmission_interval_seconds));
|
||||||
|
let mut notification_interval = interval(Duration::from_secs(self.config.notifications.aggregation_interval_seconds));
|
||||||
|
|
||||||
|
loop {
|
||||||
|
tokio::select! {
|
||||||
|
_ = collection_interval.tick() => {
|
||||||
|
// Only collect and cache metrics, no ZMQ transmission
|
||||||
|
if let Err(e) = self.collect_metrics_only().await {
|
||||||
|
error!("Failed to collect metrics: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = transmission_interval.tick() => {
|
||||||
|
// Send all metrics via ZMQ (dashboard updates only)
|
||||||
|
if let Err(e) = self.broadcast_all_metrics().await {
|
||||||
|
error!("Failed to broadcast metrics: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = notification_interval.tick() => {
|
||||||
|
// Process batched email notifications (separate from dashboard updates)
|
||||||
|
if let Err(e) = self.host_status_manager.process_pending_notifications(&mut self.notification_manager).await {
|
||||||
|
error!("Failed to process pending notifications: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Handle incoming commands (check periodically)
|
||||||
|
_ = tokio::time::sleep(Duration::from_millis(100)) => {
|
||||||
|
if let Err(e) = self.handle_commands().await {
|
||||||
|
error!("Error handling commands: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = &mut shutdown_rx => {
|
||||||
|
info!("Shutdown signal received, stopping agent loop");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Agent main loop stopped");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn collect_all_metrics_force(&mut self) -> Result<()> {
|
||||||
|
info!("Starting FORCE metric collection for startup");
|
||||||
|
|
||||||
|
// Force collect all metrics from all collectors immediately
|
||||||
|
let metrics = self.metric_manager.collect_all_metrics_force().await?;
|
||||||
|
|
||||||
|
if metrics.is_empty() {
|
||||||
|
error!("No metrics collected during force collection!");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Force collected and cached {} metrics", metrics.len());
|
||||||
|
|
||||||
|
// Process metrics through status manager (collect status data at startup)
|
||||||
|
let _status_changed = self.process_metrics(&metrics).await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn collect_metrics_only(&mut self) -> Result<()> {
|
||||||
|
debug!("Starting metric collection cycle (cache only)");
|
||||||
|
|
||||||
|
// Collect all metrics from all collectors and cache them
|
||||||
|
let metrics = self.metric_manager.collect_all_metrics().await?;
|
||||||
|
|
||||||
|
if metrics.is_empty() {
|
||||||
|
debug!("No metrics collected this cycle");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!("Collected and cached {} metrics", metrics.len());
|
||||||
|
|
||||||
|
// Process metrics through status manager and trigger immediate transmission if status changed
|
||||||
|
let status_changed = self.process_metrics(&metrics).await;
|
||||||
|
|
||||||
|
if status_changed {
|
||||||
|
info!("Status change detected - triggering immediate metric transmission");
|
||||||
|
if let Err(e) = self.broadcast_all_metrics().await {
|
||||||
|
error!("Failed to broadcast metrics after status change: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn broadcast_all_metrics(&mut self) -> Result<()> {
|
||||||
|
debug!("Broadcasting cached metrics via ZMQ");
|
||||||
|
|
||||||
|
// Get cached metrics (no fresh collection)
|
||||||
|
let mut metrics = self.metric_manager.get_cached_metrics();
|
||||||
|
|
||||||
|
// Add the host status summary metric from status manager
|
||||||
|
let host_status_metric = self.host_status_manager.get_host_status_metric();
|
||||||
|
metrics.push(host_status_metric);
|
||||||
|
|
||||||
|
// Add agent version metric for cross-host version comparison
|
||||||
|
let version_metric = self.get_agent_version_metric();
|
||||||
|
metrics.push(version_metric);
|
||||||
|
|
||||||
|
// Check for user-stopped services that are now active and clear their flags
|
||||||
|
self.clear_user_stopped_flags_for_active_services(&metrics);
|
||||||
|
|
||||||
|
if metrics.is_empty() {
|
||||||
|
debug!("No metrics to broadcast");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!("Broadcasting {} cached metrics (including host status summary)", metrics.len());
|
||||||
|
|
||||||
|
// Create and send message with all current data
|
||||||
|
let message = MetricMessage::new(self.hostname.clone(), metrics);
|
||||||
|
self.zmq_handler.publish_metrics(&message).await?;
|
||||||
|
|
||||||
|
debug!("Metrics broadcasted successfully");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn process_metrics(&mut self, metrics: &[Metric]) -> bool {
|
||||||
|
let mut status_changed = false;
|
||||||
|
for metric in metrics {
|
||||||
|
if self.host_status_manager.process_metric(metric, &mut self.notification_manager).await {
|
||||||
|
status_changed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
status_changed
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create agent version metric for cross-host version comparison
|
||||||
|
fn get_agent_version_metric(&self) -> Metric {
|
||||||
|
// Get version from executable path (same logic as main.rs get_version)
|
||||||
|
let version = self.get_agent_version();
|
||||||
|
|
||||||
|
Metric::new(
|
||||||
|
"agent_version".to_string(),
|
||||||
|
MetricValue::String(version),
|
||||||
|
Status::Ok,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get agent version from Cargo package version
|
||||||
|
fn get_agent_version(&self) -> String {
|
||||||
|
// Use the version from Cargo.toml (e.g., "0.1.11")
|
||||||
|
format!("v{}", env!("CARGO_PKG_VERSION"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn handle_commands(&mut self) -> Result<()> {
|
||||||
|
// Try to receive commands (non-blocking)
|
||||||
|
match self.zmq_handler.try_receive_command() {
|
||||||
|
Ok(Some(command)) => {
|
||||||
|
info!("Received command: {:?}", command);
|
||||||
|
self.process_command(command).await?;
|
||||||
|
}
|
||||||
|
Ok(None) => {
|
||||||
|
// No command available - this is normal
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("Error receiving command: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn process_command(&mut self, command: AgentCommand) -> Result<()> {
|
||||||
|
match command {
|
||||||
|
AgentCommand::CollectNow => {
|
||||||
|
info!("Processing CollectNow command");
|
||||||
|
if let Err(e) = self.collect_metrics_only().await {
|
||||||
|
error!("Failed to collect metrics on command: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
AgentCommand::SetInterval { seconds } => {
|
||||||
|
info!("Processing SetInterval command: {} seconds", seconds);
|
||||||
|
// Note: This would require modifying the interval, which is complex
|
||||||
|
// For now, just log the request
|
||||||
|
info!("Interval change requested but not implemented yet");
|
||||||
|
}
|
||||||
|
AgentCommand::ToggleCollector { name, enabled } => {
|
||||||
|
info!(
|
||||||
|
"Processing ToggleCollector command: {} -> {}",
|
||||||
|
name, enabled
|
||||||
|
);
|
||||||
|
// Note: This would require dynamic collector management
|
||||||
|
info!("Collector toggle requested but not implemented yet");
|
||||||
|
}
|
||||||
|
AgentCommand::Ping => {
|
||||||
|
info!("Processing Ping command - agent is alive");
|
||||||
|
// Could send a response back via ZMQ if needed
|
||||||
|
}
|
||||||
|
AgentCommand::ServiceControl { service_name, action } => {
|
||||||
|
info!("Processing ServiceControl command: {} {:?}", service_name, action);
|
||||||
|
if let Err(e) = self.handle_service_control(&service_name, &action).await {
|
||||||
|
error!("Failed to execute service control: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Handle systemd service control commands
|
||||||
|
async fn handle_service_control(&mut self, service_name: &str, action: &ServiceAction) -> Result<()> {
|
||||||
|
let (action_str, is_user_action) = match action {
|
||||||
|
ServiceAction::Start => ("start", false),
|
||||||
|
ServiceAction::Stop => ("stop", false),
|
||||||
|
ServiceAction::Status => ("status", false),
|
||||||
|
ServiceAction::UserStart => ("start", true),
|
||||||
|
ServiceAction::UserStop => ("stop", true),
|
||||||
|
};
|
||||||
|
|
||||||
|
info!("Executing systemctl {} {} (user action: {})", action_str, service_name, is_user_action);
|
||||||
|
|
||||||
|
// Handle user-stopped service tracking before systemctl execution (stop only)
|
||||||
|
match action {
|
||||||
|
ServiceAction::UserStop => {
|
||||||
|
info!("Marking service '{}' as user-stopped", service_name);
|
||||||
|
if let Err(e) = self.service_tracker.mark_user_stopped(service_name) {
|
||||||
|
error!("Failed to mark service as user-stopped: {}", e);
|
||||||
|
} else {
|
||||||
|
// Sync to global tracker
|
||||||
|
UserStoppedServiceTracker::update_global(&self.service_tracker);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
let output = tokio::process::Command::new("sudo")
|
||||||
|
.arg("systemctl")
|
||||||
|
.arg(action_str)
|
||||||
|
.arg(format!("{}.service", service_name))
|
||||||
|
.output()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
if output.status.success() {
|
||||||
|
info!("Service {} {} completed successfully", service_name, action_str);
|
||||||
|
if !output.stdout.is_empty() {
|
||||||
|
debug!("stdout: {}", String::from_utf8_lossy(&output.stdout));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note: User-stopped flag will be cleared by systemd collector
|
||||||
|
// when service actually reaches 'active' state, not here
|
||||||
|
} else {
|
||||||
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||||
|
error!("Service {} {} failed: {}", service_name, action_str, stderr);
|
||||||
|
return Err(anyhow::anyhow!("systemctl {} {} failed: {}", action_str, service_name, stderr));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Force refresh metrics after service control to update service status
|
||||||
|
if matches!(action, ServiceAction::Start | ServiceAction::Stop | ServiceAction::UserStart | ServiceAction::UserStop) {
|
||||||
|
info!("Triggering immediate metric refresh after service control");
|
||||||
|
if let Err(e) = self.collect_metrics_only().await {
|
||||||
|
error!("Failed to refresh metrics after service control: {}", e);
|
||||||
|
} else {
|
||||||
|
info!("Service status refreshed immediately after {} {}", action_str, service_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check metrics for user-stopped services that are now active and clear their flags
|
||||||
|
fn clear_user_stopped_flags_for_active_services(&mut self, metrics: &[Metric]) {
|
||||||
|
for metric in metrics {
|
||||||
|
// Look for service status metrics that are active
|
||||||
|
if metric.name.starts_with("service_") && metric.name.ends_with("_status") {
|
||||||
|
if let MetricValue::String(status) = &metric.value {
|
||||||
|
if status == "active" {
|
||||||
|
// Extract service name from metric name (service_nginx_status -> nginx)
|
||||||
|
let service_name = metric.name
|
||||||
|
.strip_prefix("service_")
|
||||||
|
.and_then(|s| s.strip_suffix("_status"))
|
||||||
|
.unwrap_or("");
|
||||||
|
|
||||||
|
if !service_name.is_empty() && UserStoppedServiceTracker::is_service_user_stopped(service_name) {
|
||||||
|
info!("Service '{}' is now active - clearing user-stopped flag", service_name);
|
||||||
|
if let Err(e) = self.service_tracker.clear_user_stopped(service_name) {
|
||||||
|
error!("Failed to clear user-stopped flag for '{}': {}", service_name, e);
|
||||||
|
} else {
|
||||||
|
// Sync to global tracker
|
||||||
|
UserStoppedServiceTracker::update_global(&self.service_tracker);
|
||||||
|
debug!("Cleared user-stopped flag for service '{}'", service_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@@ -1,479 +1,435 @@
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::Utc;
|
||||||
|
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::json;
|
use std::collections::HashMap;
|
||||||
use std::process::Stdio;
|
|
||||||
use std::time::Duration;
|
|
||||||
use tokio::process::Command;
|
|
||||||
use tokio::time::timeout;
|
|
||||||
use tokio::fs;
|
use tokio::fs;
|
||||||
|
|
||||||
use super::{AgentType, Collector, CollectorError, CollectorOutput};
|
use super::{Collector, CollectorError};
|
||||||
|
use tracing::error;
|
||||||
|
|
||||||
|
/// Backup collector that reads TOML status files for borgbackup metrics
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct BackupCollector {
|
pub struct BackupCollector {
|
||||||
pub interval: Duration,
|
pub backup_status_file: String,
|
||||||
pub restic_repo: Option<String>,
|
pub max_age_hours: u64,
|
||||||
pub backup_service: String,
|
|
||||||
pub timeout_ms: u64,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BackupCollector {
|
impl BackupCollector {
|
||||||
pub fn new(
|
pub fn new(backup_status_file: Option<String>, max_age_hours: u64) -> Self {
|
||||||
_enabled: bool,
|
|
||||||
interval_ms: u64,
|
|
||||||
restic_repo: Option<String>,
|
|
||||||
backup_service: String,
|
|
||||||
) -> Self {
|
|
||||||
Self {
|
Self {
|
||||||
interval: Duration::from_millis(interval_ms),
|
backup_status_file: backup_status_file
|
||||||
restic_repo,
|
.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
|
||||||
backup_service,
|
max_age_hours,
|
||||||
timeout_ms: 30000, // 30 second timeout for backup operations
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn get_borgbackup_metrics(&self) -> Result<BorgbackupMetrics, CollectorError> {
|
async fn read_backup_status(&self) -> Result<Option<BackupStatusToml>, CollectorError> {
|
||||||
// Read metrics from the borgbackup JSON file
|
// Check if backup status file exists
|
||||||
let metrics_path = "/var/lib/backup/backup-metrics.json";
|
if !std::path::Path::new(&self.backup_status_file).exists() {
|
||||||
|
return Ok(None); // File doesn't exist, but this is not an error
|
||||||
let content = fs::read_to_string(metrics_path)
|
}
|
||||||
|
|
||||||
|
let content = fs::read_to_string(&self.backup_status_file)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| CollectorError::IoError {
|
.map_err(|e| CollectorError::SystemRead {
|
||||||
message: format!("Failed to read backup metrics file: {}", e),
|
path: self.backup_status_file.clone(),
|
||||||
})?;
|
error: e.to_string(),
|
||||||
|
|
||||||
let metrics: BorgbackupMetrics = serde_json::from_str(&content)
|
|
||||||
.map_err(|e| CollectorError::ParseError {
|
|
||||||
message: format!("Failed to parse backup metrics JSON: {}", e),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
Ok(metrics)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_restic_snapshots(&self) -> Result<ResticStats, CollectorError> {
|
|
||||||
let repo = self
|
|
||||||
.restic_repo
|
|
||||||
.as_ref()
|
|
||||||
.ok_or_else(|| CollectorError::ConfigError {
|
|
||||||
message: "No restic repository configured".to_string(),
|
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let timeout_duration = Duration::from_millis(self.timeout_ms);
|
let backup_status = toml::from_str(&content).map_err(|e| CollectorError::Parse {
|
||||||
|
value: "backup status TOML".to_string(),
|
||||||
// Get restic snapshots
|
error: e.to_string(),
|
||||||
let output = timeout(
|
|
||||||
timeout_duration,
|
|
||||||
Command::new("restic")
|
|
||||||
.args(["-r", repo, "snapshots", "--json"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output(),
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.map_err(|_| CollectorError::Timeout {
|
|
||||||
duration_ms: self.timeout_ms,
|
|
||||||
})?
|
|
||||||
.map_err(|e| CollectorError::CommandFailed {
|
|
||||||
command: format!("restic -r {} snapshots --json", repo),
|
|
||||||
message: e.to_string(),
|
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
if !output.status.success() {
|
Ok(Some(backup_status))
|
||||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
||||||
return Err(CollectorError::CommandFailed {
|
|
||||||
command: format!("restic -r {} snapshots --json", repo),
|
|
||||||
message: stderr.to_string(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let snapshots: Vec<ResticSnapshot> =
|
|
||||||
serde_json::from_str(&stdout).map_err(|e| CollectorError::ParseError {
|
|
||||||
message: format!("Failed to parse restic snapshots: {}", e),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
// Get repository stats
|
|
||||||
let stats_output = timeout(
|
|
||||||
timeout_duration,
|
|
||||||
Command::new("restic")
|
|
||||||
.args(["-r", repo, "stats", "--json"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output(),
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.map_err(|_| CollectorError::Timeout {
|
|
||||||
duration_ms: self.timeout_ms,
|
|
||||||
})?
|
|
||||||
.map_err(|e| CollectorError::CommandFailed {
|
|
||||||
command: format!("restic -r {} stats --json", repo),
|
|
||||||
message: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let repo_size_gb = if stats_output.status.success() {
|
|
||||||
let stats_stdout = String::from_utf8_lossy(&stats_output.stdout);
|
|
||||||
let stats: Result<ResticStats, _> = serde_json::from_str(&stats_stdout);
|
|
||||||
stats
|
|
||||||
.ok()
|
|
||||||
.map(|s| s.total_size as f32 / (1024.0 * 1024.0 * 1024.0))
|
|
||||||
.unwrap_or(0.0)
|
|
||||||
} else {
|
|
||||||
0.0
|
|
||||||
};
|
|
||||||
|
|
||||||
// Find most recent snapshot
|
|
||||||
let last_success = snapshots.iter().map(|s| s.time).max();
|
|
||||||
|
|
||||||
Ok(ResticStats {
|
|
||||||
total_size: (repo_size_gb * 1024.0 * 1024.0 * 1024.0) as u64,
|
|
||||||
snapshot_count: snapshots.len() as u32,
|
|
||||||
last_success,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn get_backup_service_status(&self) -> Result<BackupServiceData, CollectorError> {
|
fn calculate_backup_status(&self, backup_status: &BackupStatusToml) -> Status {
|
||||||
let timeout_duration = Duration::from_millis(self.timeout_ms);
|
// Parse the start time to check age - handle both RFC3339 and local timestamp formats
|
||||||
|
let start_time = match chrono::DateTime::parse_from_rfc3339(&backup_status.start_time) {
|
||||||
// Get systemctl status for backup service
|
Ok(dt) => dt.with_timezone(&Utc),
|
||||||
let status_output = timeout(
|
|
||||||
timeout_duration,
|
|
||||||
Command::new("/run/current-system/sw/bin/systemctl")
|
|
||||||
.args([
|
|
||||||
"show",
|
|
||||||
&self.backup_service,
|
|
||||||
"--property=ActiveState,SubState,MainPID",
|
|
||||||
])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output(),
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.map_err(|_| CollectorError::Timeout {
|
|
||||||
duration_ms: self.timeout_ms,
|
|
||||||
})?
|
|
||||||
.map_err(|e| CollectorError::CommandFailed {
|
|
||||||
command: format!("systemctl show {}", self.backup_service),
|
|
||||||
message: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let enabled = if status_output.status.success() {
|
|
||||||
let status_stdout = String::from_utf8_lossy(&status_output.stdout);
|
|
||||||
status_stdout.contains("ActiveState=active")
|
|
||||||
|| status_stdout.contains("SubState=running")
|
|
||||||
} else {
|
|
||||||
false
|
|
||||||
};
|
|
||||||
|
|
||||||
// Check for backup timer or service logs for last message
|
|
||||||
let last_message = self.get_last_backup_log_message().await.ok();
|
|
||||||
|
|
||||||
// Check for pending backup jobs (simplified - could check systemd timers)
|
|
||||||
let pending_jobs = 0; // TODO: Implement proper pending job detection
|
|
||||||
|
|
||||||
Ok(BackupServiceData {
|
|
||||||
enabled,
|
|
||||||
pending_jobs,
|
|
||||||
last_message,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_last_backup_log_message(&self) -> Result<String, CollectorError> {
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/journalctl")
|
|
||||||
.args([
|
|
||||||
"-u",
|
|
||||||
&self.backup_service,
|
|
||||||
"--lines=1",
|
|
||||||
"--no-pager",
|
|
||||||
"--output=cat",
|
|
||||||
])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.map_err(|e| CollectorError::CommandFailed {
|
|
||||||
command: format!("journalctl -u {} --lines=1", self.backup_service),
|
|
||||||
message: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let message = stdout.trim().to_string();
|
|
||||||
if !message.is_empty() {
|
|
||||||
return Ok(message);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(CollectorError::ParseError {
|
|
||||||
message: "No log messages found".to_string(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_backup_logs_for_failures(&self) -> Result<Option<DateTime<Utc>>, CollectorError> {
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/journalctl")
|
|
||||||
.args([
|
|
||||||
"-u",
|
|
||||||
&self.backup_service,
|
|
||||||
"--since",
|
|
||||||
"1 week ago",
|
|
||||||
"--grep=failed\\|error\\|ERROR",
|
|
||||||
"--output=json",
|
|
||||||
"--lines=1",
|
|
||||||
])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.map_err(|e| CollectorError::CommandFailed {
|
|
||||||
command: format!(
|
|
||||||
"journalctl -u {} --since='1 week ago' --grep=failed",
|
|
||||||
self.backup_service
|
|
||||||
),
|
|
||||||
message: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
if let Ok(log_entry) = serde_json::from_str::<JournalEntry>(&stdout) {
|
|
||||||
if let Ok(timestamp) = log_entry.realtime_timestamp.parse::<i64>() {
|
|
||||||
let dt =
|
|
||||||
DateTime::from_timestamp_micros(timestamp).unwrap_or_else(|| Utc::now());
|
|
||||||
return Ok(Some(dt));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(None)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn determine_backup_status(
|
|
||||||
&self,
|
|
||||||
restic_stats: &Result<ResticStats, CollectorError>,
|
|
||||||
service_data: &BackupServiceData,
|
|
||||||
last_failure: Option<DateTime<Utc>>,
|
|
||||||
) -> BackupStatus {
|
|
||||||
match restic_stats {
|
|
||||||
Ok(stats) => {
|
|
||||||
if let Some(last_success) = stats.last_success {
|
|
||||||
let hours_since_backup =
|
|
||||||
Utc::now().signed_duration_since(last_success).num_hours();
|
|
||||||
|
|
||||||
if hours_since_backup > 48 {
|
|
||||||
BackupStatus::Warning // More than 2 days since last backup
|
|
||||||
} else if let Some(failure) = last_failure {
|
|
||||||
if failure > last_success {
|
|
||||||
BackupStatus::Failed // Failure after last success
|
|
||||||
} else {
|
|
||||||
BackupStatus::Healthy
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
BackupStatus::Healthy
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
BackupStatus::Warning // No successful backups found
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
if service_data.enabled {
|
// Try parsing as naive datetime and assume UTC
|
||||||
BackupStatus::Failed // Service enabled but can't access repo
|
match chrono::NaiveDateTime::parse_from_str(
|
||||||
} else {
|
&backup_status.start_time,
|
||||||
BackupStatus::Unknown // Service disabled
|
"%Y-%m-%dT%H:%M:%S%.f",
|
||||||
|
) {
|
||||||
|
Ok(naive_dt) => naive_dt.and_utc(),
|
||||||
|
Err(_) => {
|
||||||
|
error!(
|
||||||
|
"Failed to parse backup timestamp: {}",
|
||||||
|
backup_status.start_time
|
||||||
|
);
|
||||||
|
return Status::Unknown;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let hours_since_backup = Utc::now().signed_duration_since(start_time).num_hours();
|
||||||
|
|
||||||
|
// Check overall backup status
|
||||||
|
match backup_status.status.as_str() {
|
||||||
|
"success" => {
|
||||||
|
if hours_since_backup > self.max_age_hours as i64 {
|
||||||
|
Status::Warning // Backup too old
|
||||||
|
} else {
|
||||||
|
Status::Ok
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"failed" => Status::Critical,
|
||||||
|
"running" => Status::Ok, // Currently running is OK
|
||||||
|
_ => Status::Unknown,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn calculate_service_status(&self, service: &ServiceStatus) -> Status {
|
||||||
|
match service.status.as_str() {
|
||||||
|
"completed" => {
|
||||||
|
if service.exit_code == 0 {
|
||||||
|
Status::Ok
|
||||||
|
} else {
|
||||||
|
Status::Critical
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"failed" => Status::Critical,
|
||||||
|
"disabled" => Status::Warning, // Service intentionally disabled
|
||||||
|
"running" => Status::Ok,
|
||||||
|
_ => Status::Unknown,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn bytes_to_gb(bytes: u64) -> f32 {
|
||||||
|
bytes as f32 / (1024.0 * 1024.0 * 1024.0)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl Collector for BackupCollector {
|
impl Collector for BackupCollector {
|
||||||
fn name(&self) -> &str {
|
|
||||||
"backup"
|
|
||||||
}
|
|
||||||
|
|
||||||
fn agent_type(&self) -> AgentType {
|
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||||
AgentType::Backup
|
let backup_status_option = self.read_backup_status().await?;
|
||||||
}
|
let mut metrics = Vec::new();
|
||||||
|
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||||
|
|
||||||
fn collect_interval(&self) -> Duration {
|
// If no backup status file exists, return minimal metrics indicating no backup system
|
||||||
self.interval
|
let backup_status = match backup_status_option {
|
||||||
}
|
Some(status) => status,
|
||||||
|
None => {
|
||||||
|
// No backup system configured - return minimal "unknown" metrics
|
||||||
async fn collect(&self) -> Result<CollectorOutput, CollectorError> {
|
metrics.push(Metric {
|
||||||
// Try to get borgbackup metrics first, fall back to restic if not available
|
name: "backup_overall_status".to_string(),
|
||||||
let borgbackup_result = self.get_borgbackup_metrics().await;
|
value: MetricValue::String("no_backup_system".to_string()),
|
||||||
|
status: Status::Unknown,
|
||||||
let (backup_info, overall_status) = match &borgbackup_result {
|
timestamp,
|
||||||
Ok(borg_metrics) => {
|
description: Some("No backup system configured (no status file found)".to_string()),
|
||||||
// Parse borgbackup timestamp to DateTime
|
unit: None,
|
||||||
let last_success = chrono::DateTime::from_timestamp(borg_metrics.timestamp, 0);
|
});
|
||||||
|
return Ok(metrics);
|
||||||
// Determine status from borgbackup data
|
|
||||||
let status = match borg_metrics.status.as_str() {
|
|
||||||
"success" => BackupStatus::Healthy,
|
|
||||||
"warning" => BackupStatus::Warning,
|
|
||||||
"failed" => BackupStatus::Failed,
|
|
||||||
_ => BackupStatus::Unknown,
|
|
||||||
};
|
|
||||||
|
|
||||||
let backup_info = BackupInfo {
|
|
||||||
last_success,
|
|
||||||
last_failure: None, // borgbackup metrics don't include failure info
|
|
||||||
size_gb: borg_metrics.repository.total_repository_size_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
|
|
||||||
latest_archive_size_gb: Some(borg_metrics.repository.latest_archive_size_bytes as f32 / (1024.0 * 1024.0 * 1024.0)),
|
|
||||||
snapshot_count: borg_metrics.repository.total_archives as u32,
|
|
||||||
};
|
|
||||||
|
|
||||||
(backup_info, status)
|
|
||||||
},
|
|
||||||
Err(_) => {
|
|
||||||
// Fall back to restic if borgbackup metrics not available
|
|
||||||
let restic_stats = self.get_restic_snapshots().await;
|
|
||||||
let last_failure = self.get_backup_logs_for_failures().await.unwrap_or(None);
|
|
||||||
|
|
||||||
// Get backup service status for fallback determination
|
|
||||||
let service_data = self
|
|
||||||
.get_backup_service_status()
|
|
||||||
.await
|
|
||||||
.unwrap_or(BackupServiceData {
|
|
||||||
enabled: false,
|
|
||||||
pending_jobs: 0,
|
|
||||||
last_message: None,
|
|
||||||
});
|
|
||||||
|
|
||||||
let overall_status = self.determine_backup_status(&restic_stats, &service_data, last_failure);
|
|
||||||
|
|
||||||
let backup_info = match &restic_stats {
|
|
||||||
Ok(stats) => BackupInfo {
|
|
||||||
last_success: stats.last_success,
|
|
||||||
last_failure,
|
|
||||||
size_gb: stats.total_size as f32 / (1024.0 * 1024.0 * 1024.0),
|
|
||||||
latest_archive_size_gb: None, // Restic doesn't provide this easily
|
|
||||||
snapshot_count: stats.snapshot_count,
|
|
||||||
},
|
|
||||||
Err(_) => BackupInfo {
|
|
||||||
last_success: None,
|
|
||||||
last_failure,
|
|
||||||
size_gb: 0.0,
|
|
||||||
latest_archive_size_gb: None,
|
|
||||||
snapshot_count: 0,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
(backup_info, overall_status)
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Get backup service status
|
// Overall backup status
|
||||||
let service_data = self
|
let overall_status = self.calculate_backup_status(&backup_status);
|
||||||
.get_backup_service_status()
|
metrics.push(Metric {
|
||||||
.await
|
name: "backup_overall_status".to_string(),
|
||||||
.unwrap_or(BackupServiceData {
|
value: MetricValue::String(match overall_status {
|
||||||
enabled: false,
|
Status::Ok => "ok".to_string(),
|
||||||
pending_jobs: 0,
|
Status::Pending => "pending".to_string(),
|
||||||
last_message: None,
|
Status::Warning => "warning".to_string(),
|
||||||
|
Status::Critical => "critical".to_string(),
|
||||||
|
Status::Unknown => "unknown".to_string(),
|
||||||
|
}),
|
||||||
|
status: overall_status,
|
||||||
|
timestamp,
|
||||||
|
description: Some(format!(
|
||||||
|
"Backup: {} at {}",
|
||||||
|
backup_status.status, backup_status.start_time
|
||||||
|
)),
|
||||||
|
unit: None,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Backup duration
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "backup_duration_seconds".to_string(),
|
||||||
|
value: MetricValue::Integer(backup_status.duration_seconds),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some("Duration of last backup run".to_string()),
|
||||||
|
unit: Some("seconds".to_string()),
|
||||||
|
});
|
||||||
|
|
||||||
|
// Last backup timestamp - use last_updated (when backup finished) instead of start_time
|
||||||
|
let last_updated_dt_result =
|
||||||
|
chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
|
||||||
|
.map(|dt| dt.with_timezone(&Utc))
|
||||||
|
.or_else(|_| {
|
||||||
|
// Try parsing as naive datetime and assume UTC
|
||||||
|
chrono::NaiveDateTime::parse_from_str(
|
||||||
|
&backup_status.last_updated,
|
||||||
|
"%Y-%m-%dT%H:%M:%S%.f",
|
||||||
|
)
|
||||||
|
.map(|naive_dt| naive_dt.and_utc())
|
||||||
|
});
|
||||||
|
|
||||||
|
if let Ok(last_updated_dt) = last_updated_dt_result {
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "backup_last_run_timestamp".to_string(),
|
||||||
|
value: MetricValue::Integer(last_updated_dt.timestamp()),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some("Timestamp of last backup completion".to_string()),
|
||||||
|
unit: Some("unix_timestamp".to_string()),
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
error!(
|
||||||
|
"Failed to parse backup timestamp for last_run_timestamp: {}",
|
||||||
|
backup_status.last_updated
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Individual service metrics
|
||||||
|
for (service_name, service) in &backup_status.services {
|
||||||
|
let service_status = self.calculate_service_status(service);
|
||||||
|
|
||||||
|
// Service status
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("backup_service_{}_status", service_name),
|
||||||
|
value: MetricValue::String(match service_status {
|
||||||
|
Status::Ok => "ok".to_string(),
|
||||||
|
Status::Pending => "pending".to_string(),
|
||||||
|
Status::Warning => "warning".to_string(),
|
||||||
|
Status::Critical => "critical".to_string(),
|
||||||
|
Status::Unknown => "unknown".to_string(),
|
||||||
|
}),
|
||||||
|
status: service_status,
|
||||||
|
timestamp,
|
||||||
|
description: Some(format!(
|
||||||
|
"Backup service {} status: {}",
|
||||||
|
service_name, service.status
|
||||||
|
)),
|
||||||
|
unit: None,
|
||||||
});
|
});
|
||||||
|
|
||||||
// Convert BackupStatus to standardized string format
|
// Service exit code
|
||||||
let status_string = match overall_status {
|
metrics.push(Metric {
|
||||||
BackupStatus::Healthy => "ok",
|
name: format!("backup_service_{}_exit_code", service_name),
|
||||||
BackupStatus::Warning => "warning",
|
value: MetricValue::Integer(service.exit_code),
|
||||||
BackupStatus::Failed => "critical",
|
status: if service.exit_code == 0 {
|
||||||
BackupStatus::Unknown => "unknown",
|
Status::Ok
|
||||||
};
|
} else {
|
||||||
|
Status::Critical
|
||||||
|
},
|
||||||
|
timestamp,
|
||||||
|
description: Some(format!("Exit code for backup service {}", service_name)),
|
||||||
|
unit: None,
|
||||||
|
});
|
||||||
|
|
||||||
// Add disk information if available from borgbackup metrics
|
// Repository archive count
|
||||||
let mut backup_json = json!({
|
metrics.push(Metric {
|
||||||
"overall_status": status_string,
|
name: format!("backup_service_{}_archive_count", service_name),
|
||||||
"backup": backup_info,
|
value: MetricValue::Integer(service.archive_count),
|
||||||
"service": service_data,
|
status: Status::Ok,
|
||||||
"timestamp": Utc::now()
|
timestamp,
|
||||||
});
|
description: Some(format!("Number of archives in {} repository", service_name)),
|
||||||
|
unit: Some("archives".to_string()),
|
||||||
// If we got borgbackup metrics, include disk information
|
});
|
||||||
if let Ok(borg_metrics) = &borgbackup_result {
|
|
||||||
backup_json["disk"] = json!({
|
// Repository size in GB
|
||||||
"device": borg_metrics.backup_disk.device,
|
let repo_size_gb = Self::bytes_to_gb(service.repo_size_bytes);
|
||||||
"health": borg_metrics.backup_disk.health,
|
metrics.push(Metric {
|
||||||
"total_gb": borg_metrics.backup_disk.total_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
|
name: format!("backup_service_{}_repo_size_gb", service_name),
|
||||||
"used_gb": borg_metrics.backup_disk.used_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
|
value: MetricValue::Float(repo_size_gb),
|
||||||
"usage_percent": borg_metrics.backup_disk.usage_percent
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some(format!("Repository size for {} in GB", service_name)),
|
||||||
|
unit: Some("GB".to_string()),
|
||||||
|
});
|
||||||
|
|
||||||
|
// Repository path for reference
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("backup_service_{}_repo_path", service_name),
|
||||||
|
value: MetricValue::String(service.repo_path.clone()),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some(format!("Repository path for {}", service_name)),
|
||||||
|
unit: None,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let backup_metrics = backup_json;
|
// Total number of services
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "backup_total_services".to_string(),
|
||||||
|
value: MetricValue::Integer(backup_status.services.len() as i64),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some("Total number of backup services".to_string()),
|
||||||
|
unit: Some("services".to_string()),
|
||||||
|
});
|
||||||
|
|
||||||
Ok(CollectorOutput {
|
// Calculate total repository size
|
||||||
agent_type: AgentType::Backup,
|
let total_size_bytes: u64 = backup_status
|
||||||
data: backup_metrics,
|
.services
|
||||||
})
|
.values()
|
||||||
|
.map(|s| s.repo_size_bytes)
|
||||||
|
.sum();
|
||||||
|
let total_size_gb = Self::bytes_to_gb(total_size_bytes);
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "backup_total_repo_size_gb".to_string(),
|
||||||
|
value: MetricValue::Float(total_size_gb),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some("Total size of all backup repositories".to_string()),
|
||||||
|
unit: Some("GB".to_string()),
|
||||||
|
});
|
||||||
|
|
||||||
|
// Disk space metrics for backup directory
|
||||||
|
if let Some(ref disk_space) = backup_status.disk_space {
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "backup_disk_total_gb".to_string(),
|
||||||
|
value: MetricValue::Float(disk_space.total_gb as f32),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some("Total disk space available for backups".to_string()),
|
||||||
|
unit: Some("GB".to_string()),
|
||||||
|
});
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "backup_disk_used_gb".to_string(),
|
||||||
|
value: MetricValue::Float(disk_space.used_gb as f32),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some("Used disk space on backup drive".to_string()),
|
||||||
|
unit: Some("GB".to_string()),
|
||||||
|
});
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "backup_disk_available_gb".to_string(),
|
||||||
|
value: MetricValue::Float(disk_space.available_gb as f32),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some("Available disk space on backup drive".to_string()),
|
||||||
|
unit: Some("GB".to_string()),
|
||||||
|
});
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "backup_disk_usage_percent".to_string(),
|
||||||
|
value: MetricValue::Float(disk_space.usage_percent as f32),
|
||||||
|
status: if disk_space.usage_percent >= 95.0 {
|
||||||
|
Status::Critical
|
||||||
|
} else if disk_space.usage_percent >= 85.0 {
|
||||||
|
Status::Warning
|
||||||
|
} else {
|
||||||
|
Status::Ok
|
||||||
|
},
|
||||||
|
timestamp,
|
||||||
|
description: Some("Backup disk usage percentage".to_string()),
|
||||||
|
unit: Some("percent".to_string()),
|
||||||
|
});
|
||||||
|
|
||||||
|
// Add disk identification metrics if available from disk_space
|
||||||
|
if let Some(ref product_name) = disk_space.product_name {
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "backup_disk_product_name".to_string(),
|
||||||
|
value: MetricValue::String(product_name.clone()),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some("Backup disk product name from SMART data".to_string()),
|
||||||
|
unit: None,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(ref serial_number) = disk_space.serial_number {
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "backup_disk_serial_number".to_string(),
|
||||||
|
value: MetricValue::String(serial_number.clone()),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some("Backup disk serial number from SMART data".to_string()),
|
||||||
|
unit: None,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add standalone disk identification metrics from TOML fields
|
||||||
|
if let Some(ref product_name) = backup_status.disk_product_name {
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "backup_disk_product_name".to_string(),
|
||||||
|
value: MetricValue::String(product_name.clone()),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some("Backup disk product name from SMART data".to_string()),
|
||||||
|
unit: None,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(ref serial_number) = backup_status.disk_serial_number {
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "backup_disk_serial_number".to_string(),
|
||||||
|
value: MetricValue::String(serial_number.clone()),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some("Backup disk serial number from SMART data".to_string()),
|
||||||
|
unit: None,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count services by status
|
||||||
|
let mut status_counts = HashMap::new();
|
||||||
|
for service in backup_status.services.values() {
|
||||||
|
*status_counts.entry(service.status.clone()).or_insert(0) += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (status_name, count) in status_counts {
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("backup_services_{}_count", status_name),
|
||||||
|
value: MetricValue::Integer(count),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
description: Some(format!("Number of services with status: {}", status_name)),
|
||||||
|
unit: Some("services".to_string()),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(metrics)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
/// TOML structure for backup status file
|
||||||
struct ResticSnapshot {
|
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||||
time: DateTime<Utc>,
|
pub struct BackupStatusToml {
|
||||||
|
pub backup_name: String,
|
||||||
|
pub start_time: String,
|
||||||
|
pub current_time: String,
|
||||||
|
pub duration_seconds: i64,
|
||||||
|
pub status: String,
|
||||||
|
pub last_updated: String,
|
||||||
|
pub disk_space: Option<DiskSpace>,
|
||||||
|
pub disk_product_name: Option<String>,
|
||||||
|
pub disk_serial_number: Option<String>,
|
||||||
|
pub services: HashMap<String, ServiceStatus>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||||
struct ResticStats {
|
pub struct DiskSpace {
|
||||||
total_size: u64,
|
pub total_bytes: u64,
|
||||||
snapshot_count: u32,
|
pub used_bytes: u64,
|
||||||
last_success: Option<DateTime<Utc>>,
|
pub available_bytes: u64,
|
||||||
|
pub total_gb: f64,
|
||||||
|
pub used_gb: f64,
|
||||||
|
pub available_gb: f64,
|
||||||
|
pub usage_percent: f64,
|
||||||
|
// Optional disk identification fields
|
||||||
|
pub product_name: Option<String>,
|
||||||
|
pub serial_number: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||||
struct BackupServiceData {
|
pub struct ServiceStatus {
|
||||||
enabled: bool,
|
pub status: String,
|
||||||
pending_jobs: u32,
|
pub exit_code: i64,
|
||||||
last_message: Option<String>,
|
pub repo_path: String,
|
||||||
}
|
pub archive_count: i64,
|
||||||
|
pub repo_size_bytes: u64,
|
||||||
#[derive(Debug, Serialize)]
|
|
||||||
struct BackupInfo {
|
|
||||||
last_success: Option<DateTime<Utc>>,
|
|
||||||
last_failure: Option<DateTime<Utc>>,
|
|
||||||
size_gb: f32,
|
|
||||||
latest_archive_size_gb: Option<f32>,
|
|
||||||
snapshot_count: u32,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
|
||||||
enum BackupStatus {
|
|
||||||
Healthy,
|
|
||||||
Warning,
|
|
||||||
Failed,
|
|
||||||
Unknown,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
|
||||||
struct JournalEntry {
|
|
||||||
#[serde(rename = "__REALTIME_TIMESTAMP")]
|
|
||||||
realtime_timestamp: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
// Borgbackup metrics structure from backup script
|
|
||||||
#[derive(Debug, Deserialize)]
|
|
||||||
struct BorgbackupMetrics {
|
|
||||||
status: String,
|
|
||||||
repository: Repository,
|
|
||||||
backup_disk: BackupDisk,
|
|
||||||
timestamp: i64,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
|
||||||
struct Repository {
|
|
||||||
total_archives: i32,
|
|
||||||
latest_archive_size_bytes: i64,
|
|
||||||
total_repository_size_bytes: i64,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
|
||||||
struct BackupDisk {
|
|
||||||
device: String,
|
|
||||||
health: String,
|
|
||||||
total_bytes: i64,
|
|
||||||
used_bytes: i64,
|
|
||||||
usage_percent: f32,
|
|
||||||
}
|
}
|
||||||
|
|||||||
239
agent/src/collectors/cpu.rs
Normal file
239
agent/src/collectors/cpu.rs
Normal file
@@ -0,0 +1,239 @@
|
|||||||
|
use async_trait::async_trait;
|
||||||
|
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||||
|
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use super::{utils, Collector, CollectorError};
|
||||||
|
use crate::config::CpuConfig;
|
||||||
|
|
||||||
|
/// Extremely efficient CPU metrics collector
|
||||||
|
///
|
||||||
|
/// EFFICIENCY OPTIMIZATIONS:
|
||||||
|
/// - Single /proc/loadavg read for all load metrics
|
||||||
|
/// - Single /proc/stat read for CPU usage
|
||||||
|
/// - Minimal string allocations
|
||||||
|
/// - No process spawning
|
||||||
|
/// - <0.1ms collection time target
|
||||||
|
pub struct CpuCollector {
|
||||||
|
load_thresholds: HysteresisThresholds,
|
||||||
|
temperature_thresholds: HysteresisThresholds,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CpuCollector {
|
||||||
|
pub fn new(config: CpuConfig) -> Self {
|
||||||
|
// Create hysteresis thresholds with 10% gap for recovery
|
||||||
|
let load_thresholds = HysteresisThresholds::new(
|
||||||
|
config.load_warning_threshold,
|
||||||
|
config.load_critical_threshold,
|
||||||
|
);
|
||||||
|
|
||||||
|
let temperature_thresholds = HysteresisThresholds::new(
|
||||||
|
config.temperature_warning_threshold,
|
||||||
|
config.temperature_critical_threshold,
|
||||||
|
);
|
||||||
|
|
||||||
|
Self {
|
||||||
|
load_thresholds,
|
||||||
|
temperature_thresholds,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate CPU load status using hysteresis thresholds
|
||||||
|
fn calculate_load_status(&self, metric_name: &str, load: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||||
|
status_tracker.calculate_with_hysteresis(metric_name, load, &self.load_thresholds)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate CPU temperature status using hysteresis thresholds
|
||||||
|
fn calculate_temperature_status(&self, metric_name: &str, temp: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||||
|
status_tracker.calculate_with_hysteresis(metric_name, temp, &self.temperature_thresholds)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Collect CPU load averages from /proc/loadavg
|
||||||
|
/// Format: "0.52 0.58 0.59 1/257 12345"
|
||||||
|
async fn collect_load_averages(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||||
|
let content = utils::read_proc_file("/proc/loadavg")?;
|
||||||
|
let parts: Vec<&str> = content.trim().split_whitespace().collect();
|
||||||
|
|
||||||
|
if parts.len() < 3 {
|
||||||
|
return Err(CollectorError::Parse {
|
||||||
|
value: content,
|
||||||
|
error: "Expected at least 3 values in /proc/loadavg".to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let load_1min = utils::parse_f32(parts[0])?;
|
||||||
|
let load_5min = utils::parse_f32(parts[1])?;
|
||||||
|
let load_15min = utils::parse_f32(parts[2])?;
|
||||||
|
|
||||||
|
// Only apply thresholds to 5-minute load average
|
||||||
|
let load_1min_status = Status::Ok; // No alerting on 1min
|
||||||
|
let load_5min_status = self.calculate_load_status(registry::CPU_LOAD_5MIN, load_5min, status_tracker); // Only 5min triggers alerts
|
||||||
|
let load_15min_status = Status::Ok; // No alerting on 15min
|
||||||
|
|
||||||
|
Ok(vec![
|
||||||
|
Metric::new(
|
||||||
|
registry::CPU_LOAD_1MIN.to_string(),
|
||||||
|
MetricValue::Float(load_1min),
|
||||||
|
load_1min_status,
|
||||||
|
)
|
||||||
|
.with_description("CPU load average over 1 minute".to_string()),
|
||||||
|
Metric::new(
|
||||||
|
registry::CPU_LOAD_5MIN.to_string(),
|
||||||
|
MetricValue::Float(load_5min),
|
||||||
|
load_5min_status,
|
||||||
|
)
|
||||||
|
.with_description("CPU load average over 5 minutes".to_string()),
|
||||||
|
Metric::new(
|
||||||
|
registry::CPU_LOAD_15MIN.to_string(),
|
||||||
|
MetricValue::Float(load_15min),
|
||||||
|
load_15min_status,
|
||||||
|
)
|
||||||
|
.with_description("CPU load average over 15 minutes".to_string()),
|
||||||
|
])
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Collect CPU temperature from thermal zones
|
||||||
|
/// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
|
||||||
|
async fn collect_temperature(&self, status_tracker: &mut StatusTracker) -> Result<Option<Metric>, CollectorError> {
|
||||||
|
// Try x86_pkg_temp first (Intel CPU package temperature)
|
||||||
|
if let Ok(temp) = self
|
||||||
|
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
let temp_celsius = temp as f32 / 1000.0;
|
||||||
|
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
|
||||||
|
|
||||||
|
return Ok(Some(
|
||||||
|
Metric::new(
|
||||||
|
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
|
||||||
|
MetricValue::Float(temp_celsius),
|
||||||
|
status,
|
||||||
|
)
|
||||||
|
.with_description("CPU package temperature".to_string())
|
||||||
|
.with_unit("°C".to_string()),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: try other thermal zones
|
||||||
|
for zone_id in 0..10 {
|
||||||
|
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
|
||||||
|
if let Ok(temp) = self.read_thermal_zone(&path).await {
|
||||||
|
let temp_celsius = temp as f32 / 1000.0;
|
||||||
|
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
|
||||||
|
|
||||||
|
return Ok(Some(
|
||||||
|
Metric::new(
|
||||||
|
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
|
||||||
|
MetricValue::Float(temp_celsius),
|
||||||
|
status,
|
||||||
|
)
|
||||||
|
.with_description(format!("CPU temperature from thermal_zone{}", zone_id))
|
||||||
|
.with_unit("°C".to_string()),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!("No CPU temperature sensors found");
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read temperature from thermal zone efficiently
|
||||||
|
async fn read_thermal_zone(&self, path: &str) -> Result<u64, CollectorError> {
|
||||||
|
let content = utils::read_proc_file(path)?;
|
||||||
|
utils::parse_u64(content.trim())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Collect CPU frequency from /proc/cpuinfo or scaling governor
|
||||||
|
async fn collect_frequency(&self) -> Result<Option<Metric>, CollectorError> {
|
||||||
|
// Try scaling frequency first (more accurate for current frequency)
|
||||||
|
if let Ok(freq) =
|
||||||
|
utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq")
|
||||||
|
{
|
||||||
|
if let Ok(freq_khz) = utils::parse_u64(freq.trim()) {
|
||||||
|
let freq_mhz = freq_khz as f32 / 1000.0;
|
||||||
|
|
||||||
|
return Ok(Some(
|
||||||
|
Metric::new(
|
||||||
|
registry::CPU_FREQUENCY_MHZ.to_string(),
|
||||||
|
MetricValue::Float(freq_mhz),
|
||||||
|
Status::Ok, // Frequency doesn't have status thresholds
|
||||||
|
)
|
||||||
|
.with_description("Current CPU frequency".to_string())
|
||||||
|
.with_unit("MHz".to_string()),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: parse /proc/cpuinfo for base frequency
|
||||||
|
if let Ok(content) = utils::read_proc_file("/proc/cpuinfo") {
|
||||||
|
for line in content.lines() {
|
||||||
|
if line.starts_with("cpu MHz") {
|
||||||
|
if let Some(freq_str) = line.split(':').nth(1) {
|
||||||
|
if let Ok(freq_mhz) = utils::parse_f32(freq_str) {
|
||||||
|
return Ok(Some(
|
||||||
|
Metric::new(
|
||||||
|
registry::CPU_FREQUENCY_MHZ.to_string(),
|
||||||
|
MetricValue::Float(freq_mhz),
|
||||||
|
Status::Ok,
|
||||||
|
)
|
||||||
|
.with_description(
|
||||||
|
"CPU base frequency from /proc/cpuinfo".to_string(),
|
||||||
|
)
|
||||||
|
.with_unit("MHz".to_string()),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break; // Only need first CPU entry
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!("CPU frequency not available");
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Collector for CpuCollector {
|
||||||
|
|
||||||
|
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||||
|
debug!("Collecting CPU metrics");
|
||||||
|
let start = std::time::Instant::now();
|
||||||
|
|
||||||
|
let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency
|
||||||
|
|
||||||
|
// Collect load averages (always available)
|
||||||
|
metrics.extend(self.collect_load_averages(status_tracker).await?);
|
||||||
|
|
||||||
|
// Collect temperature (optional)
|
||||||
|
if let Some(temp_metric) = self.collect_temperature(status_tracker).await? {
|
||||||
|
metrics.push(temp_metric);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect frequency (optional)
|
||||||
|
if let Some(freq_metric) = self.collect_frequency().await? {
|
||||||
|
metrics.push(freq_metric);
|
||||||
|
}
|
||||||
|
|
||||||
|
let duration = start.elapsed();
|
||||||
|
debug!(
|
||||||
|
"CPU collection completed in {:?} with {} metrics",
|
||||||
|
duration,
|
||||||
|
metrics.len()
|
||||||
|
);
|
||||||
|
|
||||||
|
// Efficiency check: warn if collection takes too long
|
||||||
|
if duration.as_millis() > 1 {
|
||||||
|
debug!(
|
||||||
|
"CPU collection took {}ms - consider optimization",
|
||||||
|
duration.as_millis()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Store performance metrics
|
||||||
|
// Performance tracking handled by cache system
|
||||||
|
|
||||||
|
Ok(metrics)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
596
agent/src/collectors/disk.rs
Normal file
596
agent/src/collectors/disk.rs
Normal file
@@ -0,0 +1,596 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||||
|
|
||||||
|
use crate::config::DiskConfig;
|
||||||
|
use std::process::Command;
|
||||||
|
use std::time::Instant;
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use super::{Collector, CollectorError};
|
||||||
|
|
||||||
|
/// Information about a storage pool (mount point with underlying drives)
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct StoragePool {
|
||||||
|
name: String, // e.g., "steampool", "root"
|
||||||
|
mount_point: String, // e.g., "/mnt/steampool", "/"
|
||||||
|
filesystem: String, // e.g., "mergerfs", "ext4", "zfs", "btrfs"
|
||||||
|
storage_type: String, // e.g., "mergerfs", "single", "raid", "zfs"
|
||||||
|
size: String, // e.g., "2.5TB"
|
||||||
|
used: String, // e.g., "2.1TB"
|
||||||
|
available: String, // e.g., "400GB"
|
||||||
|
usage_percent: f32, // e.g., 85.0
|
||||||
|
underlying_drives: Vec<DriveInfo>, // Individual physical drives
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Information about an individual physical drive
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct DriveInfo {
|
||||||
|
device: String, // e.g., "sda", "nvme0n1"
|
||||||
|
health_status: String, // e.g., "PASSED", "FAILED"
|
||||||
|
temperature: Option<f32>, // e.g., 45.0°C
|
||||||
|
wear_level: Option<f32>, // e.g., 12.0% (for SSDs)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Disk usage collector for monitoring filesystem sizes
|
||||||
|
pub struct DiskCollector {
|
||||||
|
config: DiskConfig,
|
||||||
|
temperature_thresholds: HysteresisThresholds,
|
||||||
|
detected_devices: std::collections::HashMap<String, Vec<String>>, // mount_point -> devices
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DiskCollector {
|
||||||
|
pub fn new(config: DiskConfig) -> Self {
|
||||||
|
// Create hysteresis thresholds for disk temperature from config
|
||||||
|
let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
|
||||||
|
config.temperature_warning_celsius,
|
||||||
|
5.0, // 5°C gap for recovery
|
||||||
|
config.temperature_critical_celsius,
|
||||||
|
5.0, // 5°C gap for recovery
|
||||||
|
);
|
||||||
|
|
||||||
|
// Detect devices for all configured filesystems at startup
|
||||||
|
let mut detected_devices = std::collections::HashMap::new();
|
||||||
|
for fs_config in &config.filesystems {
|
||||||
|
if fs_config.monitor {
|
||||||
|
if let Ok(devices) = Self::detect_device_for_mount_point_static(&fs_config.mount_point) {
|
||||||
|
detected_devices.insert(fs_config.mount_point.clone(), devices);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Self {
|
||||||
|
config,
|
||||||
|
temperature_thresholds,
|
||||||
|
detected_devices,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate disk temperature status using hysteresis thresholds
|
||||||
|
fn calculate_temperature_status(&self, metric_name: &str, temperature: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||||
|
status_tracker.calculate_with_hysteresis(metric_name, temperature, &self.temperature_thresholds)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Get configured storage pools with individual drive information
|
||||||
|
fn get_configured_storage_pools(&self) -> Result<Vec<StoragePool>> {
|
||||||
|
let mut storage_pools = Vec::new();
|
||||||
|
|
||||||
|
for fs_config in &self.config.filesystems {
|
||||||
|
if !fs_config.monitor {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get filesystem stats for the mount point
|
||||||
|
match self.get_filesystem_info(&fs_config.mount_point) {
|
||||||
|
Ok((total_bytes, used_bytes)) => {
|
||||||
|
let available_bytes = total_bytes - used_bytes;
|
||||||
|
let usage_percent = if total_bytes > 0 {
|
||||||
|
(used_bytes as f64 / total_bytes as f64) * 100.0
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
|
||||||
|
// Convert bytes to human-readable format
|
||||||
|
let size = self.bytes_to_human_readable(total_bytes);
|
||||||
|
let used = self.bytes_to_human_readable(used_bytes);
|
||||||
|
let available = self.bytes_to_human_readable(available_bytes);
|
||||||
|
|
||||||
|
// Get individual drive information using pre-detected devices
|
||||||
|
let device_names = self.detected_devices.get(&fs_config.mount_point).cloned().unwrap_or_default();
|
||||||
|
let underlying_drives = self.get_drive_info_for_devices(&device_names)?;
|
||||||
|
|
||||||
|
storage_pools.push(StoragePool {
|
||||||
|
name: fs_config.name.clone(),
|
||||||
|
mount_point: fs_config.mount_point.clone(),
|
||||||
|
filesystem: fs_config.fs_type.clone(),
|
||||||
|
storage_type: fs_config.storage_type.clone(),
|
||||||
|
size,
|
||||||
|
used,
|
||||||
|
available,
|
||||||
|
usage_percent: usage_percent as f32,
|
||||||
|
underlying_drives,
|
||||||
|
});
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
"Storage pool '{}' ({}) at {} with {} detected drives",
|
||||||
|
fs_config.name, fs_config.storage_type, fs_config.mount_point, device_names.len()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
debug!(
|
||||||
|
"Failed to get filesystem info for storage pool '{}': {}",
|
||||||
|
fs_config.name, e
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(storage_pools)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get drive information for a list of device names
|
||||||
|
fn get_drive_info_for_devices(&self, device_names: &[String]) -> Result<Vec<DriveInfo>> {
|
||||||
|
let mut drives = Vec::new();
|
||||||
|
|
||||||
|
for device_name in device_names {
|
||||||
|
let device_path = format!("/dev/{}", device_name);
|
||||||
|
|
||||||
|
// Get SMART data for this drive
|
||||||
|
let (health_status, temperature, wear_level) = self.get_smart_data(&device_path);
|
||||||
|
|
||||||
|
drives.push(DriveInfo {
|
||||||
|
device: device_name.clone(),
|
||||||
|
health_status: health_status.clone(),
|
||||||
|
temperature,
|
||||||
|
wear_level,
|
||||||
|
});
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
"Drive info for {}: health={}, temp={:?}°C, wear={:?}%",
|
||||||
|
device_name, health_status, temperature, wear_level
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(drives)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get SMART data for a drive (health, temperature, wear level)
|
||||||
|
fn get_smart_data(&self, device_path: &str) -> (String, Option<f32>, Option<f32>) {
|
||||||
|
// Try to get SMART data using smartctl
|
||||||
|
let output = Command::new("sudo")
|
||||||
|
.arg("smartctl")
|
||||||
|
.arg("-a")
|
||||||
|
.arg(device_path)
|
||||||
|
.output();
|
||||||
|
|
||||||
|
match output {
|
||||||
|
Ok(result) if result.status.success() => {
|
||||||
|
let stdout = String::from_utf8_lossy(&result.stdout);
|
||||||
|
|
||||||
|
// Parse health status
|
||||||
|
let health = if stdout.contains("PASSED") {
|
||||||
|
"PASSED".to_string()
|
||||||
|
} else if stdout.contains("FAILED") {
|
||||||
|
"FAILED".to_string()
|
||||||
|
} else {
|
||||||
|
"UNKNOWN".to_string()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Parse temperature (look for various temperature indicators)
|
||||||
|
let temperature = self.parse_temperature_from_smart(&stdout);
|
||||||
|
|
||||||
|
// Parse wear level (for SSDs)
|
||||||
|
let wear_level = self.parse_wear_level_from_smart(&stdout);
|
||||||
|
|
||||||
|
(health, temperature, wear_level)
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
debug!("Failed to get SMART data for {}", device_path);
|
||||||
|
("UNKNOWN".to_string(), None, None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse temperature from SMART output
|
||||||
|
fn parse_temperature_from_smart(&self, smart_output: &str) -> Option<f32> {
|
||||||
|
for line in smart_output.lines() {
|
||||||
|
// Look for temperature in various formats
|
||||||
|
if line.contains("Temperature_Celsius") || line.contains("Temperature") {
|
||||||
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||||
|
if parts.len() >= 10 {
|
||||||
|
if let Ok(temp) = parts[9].parse::<f32>() {
|
||||||
|
return Some(temp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// NVMe drives might show temperature differently
|
||||||
|
if line.contains("temperature:") {
|
||||||
|
if let Some(temp_part) = line.split("temperature:").nth(1) {
|
||||||
|
if let Some(temp_str) = temp_part.split_whitespace().next() {
|
||||||
|
if let Ok(temp) = temp_str.parse::<f32>() {
|
||||||
|
return Some(temp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse wear level from SMART output (SSD wear leveling)
|
||||||
|
/// Supports both NVMe and SATA SSD wear indicators
|
||||||
|
fn parse_wear_level_from_smart(&self, smart_output: &str) -> Option<f32> {
|
||||||
|
for line in smart_output.lines() {
|
||||||
|
let line = line.trim();
|
||||||
|
|
||||||
|
// NVMe drives - direct percentage used
|
||||||
|
if line.contains("Percentage Used:") {
|
||||||
|
if let Some(wear_part) = line.split("Percentage Used:").nth(1) {
|
||||||
|
if let Some(wear_str) = wear_part.split('%').next() {
|
||||||
|
if let Ok(wear) = wear_str.trim().parse::<f32>() {
|
||||||
|
return Some(wear);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// SATA SSD attributes - parse SMART table format
|
||||||
|
// Format: ID ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
|
||||||
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||||
|
if parts.len() >= 10 {
|
||||||
|
// SSD Life Left / Percent Lifetime Remaining (higher = less wear)
|
||||||
|
if line.contains("SSD_Life_Left") || line.contains("Percent_Lifetime_Remain") {
|
||||||
|
if let Ok(remaining) = parts[3].parse::<f32>() { // VALUE column
|
||||||
|
return Some(100.0 - remaining); // Convert remaining to used
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Media Wearout Indicator (lower = more wear, normalize to 0-100)
|
||||||
|
if line.contains("Media_Wearout_Indicator") {
|
||||||
|
if let Ok(remaining) = parts[3].parse::<f32>() { // VALUE column
|
||||||
|
return Some(100.0 - remaining); // Convert remaining to used
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wear Leveling Count (higher = less wear, but varies by manufacturer)
|
||||||
|
if line.contains("Wear_Leveling_Count") {
|
||||||
|
if let Ok(wear_count) = parts[3].parse::<f32>() { // VALUE column
|
||||||
|
// Most SSDs: 100 = new, decreases with wear
|
||||||
|
if wear_count <= 100.0 {
|
||||||
|
return Some(100.0 - wear_count);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Total LBAs Written - calculate against typical endurance if available
|
||||||
|
// This is more complex and manufacturer-specific, so we skip for now
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert bytes to human-readable format
|
||||||
|
fn bytes_to_human_readable(&self, bytes: u64) -> String {
|
||||||
|
const UNITS: &[&str] = &["B", "K", "M", "G", "T"];
|
||||||
|
let mut size = bytes as f64;
|
||||||
|
let mut unit_index = 0;
|
||||||
|
|
||||||
|
while size >= 1024.0 && unit_index < UNITS.len() - 1 {
|
||||||
|
size /= 1024.0;
|
||||||
|
unit_index += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if unit_index == 0 {
|
||||||
|
format!("{:.0}{}", size, UNITS[unit_index])
|
||||||
|
} else {
|
||||||
|
format!("{:.1}{}", size, UNITS[unit_index])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect device backing a mount point using lsblk (static version for startup)
|
||||||
|
fn detect_device_for_mount_point_static(mount_point: &str) -> Result<Vec<String>> {
|
||||||
|
let output = Command::new("lsblk")
|
||||||
|
.args(&["-n", "-o", "NAME,MOUNTPOINT"])
|
||||||
|
.output()?;
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||||
|
|
||||||
|
for line in output_str.lines() {
|
||||||
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||||
|
if parts.len() >= 2 && parts[1] == mount_point {
|
||||||
|
// Remove tree symbols and extract device name (e.g., "├─nvme0n1p2" -> "nvme0n1p2")
|
||||||
|
let device_name = parts[0]
|
||||||
|
.trim_start_matches('├')
|
||||||
|
.trim_start_matches('└')
|
||||||
|
.trim_start_matches('─')
|
||||||
|
.trim();
|
||||||
|
|
||||||
|
// Extract base device name (e.g., "nvme0n1p2" -> "nvme0n1")
|
||||||
|
if let Some(base_device) = Self::extract_base_device(device_name) {
|
||||||
|
return Ok(vec![base_device]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Vec::new())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract base device name from partition (e.g., "nvme0n1p2" -> "nvme0n1", "sda1" -> "sda")
|
||||||
|
fn extract_base_device(device_name: &str) -> Option<String> {
|
||||||
|
// Handle NVMe devices (nvme0n1p1 -> nvme0n1)
|
||||||
|
if device_name.starts_with("nvme") {
|
||||||
|
if let Some(p_pos) = device_name.find('p') {
|
||||||
|
return Some(device_name[..p_pos].to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle traditional devices (sda1 -> sda)
|
||||||
|
if device_name.len() > 1 {
|
||||||
|
let chars: Vec<char> = device_name.chars().collect();
|
||||||
|
let mut end_idx = chars.len();
|
||||||
|
|
||||||
|
// Find where the device name ends and partition number begins
|
||||||
|
for (i, &c) in chars.iter().enumerate().rev() {
|
||||||
|
if !c.is_ascii_digit() {
|
||||||
|
end_idx = i + 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if end_idx > 0 && end_idx < chars.len() {
|
||||||
|
return Some(chars[..end_idx].iter().collect());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no partition detected, return as-is
|
||||||
|
Some(device_name.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Get filesystem info using df command
|
||||||
|
fn get_filesystem_info(&self, path: &str) -> Result<(u64, u64)> {
|
||||||
|
let output = Command::new("df")
|
||||||
|
.arg("--block-size=1")
|
||||||
|
.arg(path)
|
||||||
|
.output()?;
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
return Err(anyhow::anyhow!("df command failed for {}", path));
|
||||||
|
}
|
||||||
|
|
||||||
|
let output_str = String::from_utf8(output.stdout)?;
|
||||||
|
let lines: Vec<&str> = output_str.lines().collect();
|
||||||
|
|
||||||
|
if lines.len() < 2 {
|
||||||
|
return Err(anyhow::anyhow!("Unexpected df output format"));
|
||||||
|
}
|
||||||
|
|
||||||
|
let fields: Vec<&str> = lines[1].split_whitespace().collect();
|
||||||
|
if fields.len() < 4 {
|
||||||
|
return Err(anyhow::anyhow!("Unexpected df fields count"));
|
||||||
|
}
|
||||||
|
|
||||||
|
let total_bytes = fields[1].parse::<u64>()?;
|
||||||
|
let used_bytes = fields[2].parse::<u64>()?;
|
||||||
|
|
||||||
|
Ok((total_bytes, used_bytes))
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Parse size string (e.g., "120G", "45M") to GB value
|
||||||
|
fn parse_size_to_gb(&self, size_str: &str) -> f32 {
|
||||||
|
let size_str = size_str.trim();
|
||||||
|
if size_str.is_empty() || size_str == "-" {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract numeric part and unit
|
||||||
|
let (num_str, unit) = if let Some(last_char) = size_str.chars().last() {
|
||||||
|
if last_char.is_alphabetic() {
|
||||||
|
let num_part = &size_str[..size_str.len() - 1];
|
||||||
|
let unit_part = &size_str[size_str.len() - 1..];
|
||||||
|
(num_part, unit_part)
|
||||||
|
} else {
|
||||||
|
(size_str, "")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
(size_str, "")
|
||||||
|
};
|
||||||
|
|
||||||
|
let number: f32 = num_str.parse().unwrap_or(0.0);
|
||||||
|
|
||||||
|
match unit.to_uppercase().as_str() {
|
||||||
|
"T" | "TB" => number * 1024.0,
|
||||||
|
"G" | "GB" => number,
|
||||||
|
"M" | "MB" => number / 1024.0,
|
||||||
|
"K" | "KB" => number / (1024.0 * 1024.0),
|
||||||
|
"B" | "" => number / (1024.0 * 1024.0 * 1024.0),
|
||||||
|
_ => number, // Assume GB if unknown unit
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Collector for DiskCollector {
|
||||||
|
|
||||||
|
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||||
|
let start_time = Instant::now();
|
||||||
|
debug!("Collecting storage pool and individual drive metrics");
|
||||||
|
|
||||||
|
let mut metrics = Vec::new();
|
||||||
|
|
||||||
|
// Get configured storage pools with individual drive data
|
||||||
|
let storage_pools = match self.get_configured_storage_pools() {
|
||||||
|
Ok(pools) => {
|
||||||
|
debug!("Found {} storage pools", pools.len());
|
||||||
|
pools
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
debug!("Failed to get storage pools: {}", e);
|
||||||
|
Vec::new()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Generate metrics for each storage pool and its underlying drives
|
||||||
|
for storage_pool in &storage_pools {
|
||||||
|
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||||
|
|
||||||
|
// Storage pool overall metrics
|
||||||
|
let pool_name = &storage_pool.name;
|
||||||
|
|
||||||
|
// Parse size strings to get actual values for calculations
|
||||||
|
let size_gb = self.parse_size_to_gb(&storage_pool.size);
|
||||||
|
let used_gb = self.parse_size_to_gb(&storage_pool.used);
|
||||||
|
let avail_gb = self.parse_size_to_gb(&storage_pool.available);
|
||||||
|
|
||||||
|
// Calculate status based on configured thresholds
|
||||||
|
let pool_status = if storage_pool.usage_percent >= self.config.usage_critical_percent {
|
||||||
|
Status::Critical
|
||||||
|
} else if storage_pool.usage_percent >= self.config.usage_warning_percent {
|
||||||
|
Status::Warning
|
||||||
|
} else {
|
||||||
|
Status::Ok
|
||||||
|
};
|
||||||
|
|
||||||
|
// Storage pool info metrics
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("disk_{}_mount_point", pool_name),
|
||||||
|
value: MetricValue::String(storage_pool.mount_point.clone()),
|
||||||
|
unit: None,
|
||||||
|
description: Some(format!("Mount: {}", storage_pool.mount_point)),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("disk_{}_filesystem", pool_name),
|
||||||
|
value: MetricValue::String(storage_pool.filesystem.clone()),
|
||||||
|
unit: None,
|
||||||
|
description: Some(format!("FS: {}", storage_pool.filesystem)),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("disk_{}_storage_type", pool_name),
|
||||||
|
value: MetricValue::String(storage_pool.storage_type.clone()),
|
||||||
|
unit: None,
|
||||||
|
description: Some(format!("Type: {}", storage_pool.storage_type)),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Storage pool size metrics
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("disk_{}_total_gb", pool_name),
|
||||||
|
value: MetricValue::Float(size_gb),
|
||||||
|
unit: Some("GB".to_string()),
|
||||||
|
description: Some(format!("Total: {}", storage_pool.size)),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("disk_{}_used_gb", pool_name),
|
||||||
|
value: MetricValue::Float(used_gb),
|
||||||
|
unit: Some("GB".to_string()),
|
||||||
|
description: Some(format!("Used: {}", storage_pool.used)),
|
||||||
|
status: pool_status,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("disk_{}_available_gb", pool_name),
|
||||||
|
value: MetricValue::Float(avail_gb),
|
||||||
|
unit: Some("GB".to_string()),
|
||||||
|
description: Some(format!("Available: {}", storage_pool.available)),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("disk_{}_usage_percent", pool_name),
|
||||||
|
value: MetricValue::Float(storage_pool.usage_percent),
|
||||||
|
unit: Some("%".to_string()),
|
||||||
|
description: Some(format!("Usage: {:.1}%", storage_pool.usage_percent)),
|
||||||
|
status: pool_status,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Individual drive metrics for this storage pool
|
||||||
|
for drive in &storage_pool.underlying_drives {
|
||||||
|
// Drive health status
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("disk_{}_{}_health", pool_name, drive.device),
|
||||||
|
value: MetricValue::String(drive.health_status.clone()),
|
||||||
|
unit: None,
|
||||||
|
description: Some(format!("{}: {}", drive.device, drive.health_status)),
|
||||||
|
status: if drive.health_status == "PASSED" { Status::Ok }
|
||||||
|
else if drive.health_status == "FAILED" { Status::Critical }
|
||||||
|
else { Status::Unknown },
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Drive temperature
|
||||||
|
if let Some(temp) = drive.temperature {
|
||||||
|
let temp_status = self.calculate_temperature_status(
|
||||||
|
&format!("disk_{}_{}_temperature", pool_name, drive.device),
|
||||||
|
temp,
|
||||||
|
status_tracker
|
||||||
|
);
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("disk_{}_{}_temperature", pool_name, drive.device),
|
||||||
|
value: MetricValue::Float(temp),
|
||||||
|
unit: Some("°C".to_string()),
|
||||||
|
description: Some(format!("{}: {:.0}°C", drive.device, temp)),
|
||||||
|
status: temp_status,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Drive wear level (for SSDs)
|
||||||
|
if let Some(wear) = drive.wear_level {
|
||||||
|
let wear_status = if wear >= self.config.wear_critical_percent { Status::Critical }
|
||||||
|
else if wear >= self.config.wear_warning_percent { Status::Warning }
|
||||||
|
else { Status::Ok };
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("disk_{}_{}_wear_percent", pool_name, drive.device),
|
||||||
|
value: MetricValue::Float(wear),
|
||||||
|
unit: Some("%".to_string()),
|
||||||
|
description: Some(format!("{}: {:.0}% wear", drive.device, wear)),
|
||||||
|
status: wear_status,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add storage pool count metric
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "disk_count".to_string(),
|
||||||
|
value: MetricValue::Integer(storage_pools.len() as i64),
|
||||||
|
unit: None,
|
||||||
|
description: Some(format!("Total storage pools: {}", storage_pools.len())),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
let collection_time = start_time.elapsed();
|
||||||
|
debug!(
|
||||||
|
"Multi-disk collection completed in {:?} with {} metrics",
|
||||||
|
collection_time,
|
||||||
|
metrics.len()
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(metrics)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@@ -2,52 +2,9 @@ use thiserror::Error;
|
|||||||
|
|
||||||
#[derive(Debug, Error)]
|
#[derive(Debug, Error)]
|
||||||
pub enum CollectorError {
|
pub enum CollectorError {
|
||||||
#[error("Command execution failed: {command} - {message}")]
|
#[error("Failed to read system file {path}: {error}")]
|
||||||
CommandFailed { command: String, message: String },
|
SystemRead { path: String, error: String },
|
||||||
|
|
||||||
#[error("Permission denied: {message}")]
|
#[error("Failed to parse value '{value}': {error}")]
|
||||||
PermissionDenied { message: String },
|
Parse { value: String, error: String },
|
||||||
|
|
||||||
#[error("Data parsing error: {message}")]
|
|
||||||
ParseError { message: String },
|
|
||||||
|
|
||||||
#[error("Timeout after {duration_ms}ms")]
|
|
||||||
Timeout { duration_ms: u64 },
|
|
||||||
|
|
||||||
#[error("IO error: {message}")]
|
|
||||||
IoError { message: String },
|
|
||||||
|
|
||||||
#[error("Configuration error: {message}")]
|
|
||||||
ConfigError { message: String },
|
|
||||||
|
|
||||||
#[error("Service not found: {service}")]
|
|
||||||
ServiceNotFound { service: String },
|
|
||||||
|
|
||||||
#[error("Device not found: {device}")]
|
|
||||||
DeviceNotFound { device: String },
|
|
||||||
|
|
||||||
#[error("External dependency error: {dependency} - {message}")]
|
|
||||||
ExternalDependency { dependency: String, message: String },
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<std::io::Error> for CollectorError {
|
|
||||||
fn from(err: std::io::Error) -> Self {
|
|
||||||
CollectorError::IoError {
|
|
||||||
message: err.to_string(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<serde_json::Error> for CollectorError {
|
|
||||||
fn from(err: serde_json::Error) -> Self {
|
|
||||||
CollectorError::ParseError {
|
|
||||||
message: err.to_string(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<tokio::time::error::Elapsed> for CollectorError {
|
|
||||||
fn from(_: tokio::time::error::Elapsed) -> Self {
|
|
||||||
CollectorError::Timeout { duration_ms: 0 }
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
320
agent/src/collectors/memory.rs
Normal file
320
agent/src/collectors/memory.rs
Normal file
@@ -0,0 +1,320 @@
|
|||||||
|
use async_trait::async_trait;
|
||||||
|
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||||
|
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use super::{utils, Collector, CollectorError};
|
||||||
|
use crate::config::MemoryConfig;
|
||||||
|
|
||||||
|
/// Extremely efficient memory metrics collector
|
||||||
|
///
|
||||||
|
/// EFFICIENCY OPTIMIZATIONS:
|
||||||
|
/// - Single /proc/meminfo read for all memory metrics
|
||||||
|
/// - Minimal string parsing with split operations
|
||||||
|
/// - Pre-calculated KB to GB conversion
|
||||||
|
/// - No regex or complex parsing
|
||||||
|
/// - <0.1ms collection time target
|
||||||
|
pub struct MemoryCollector {
|
||||||
|
usage_thresholds: HysteresisThresholds,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Memory information parsed from /proc/meminfo
|
||||||
|
#[derive(Debug, Default)]
|
||||||
|
struct MemoryInfo {
|
||||||
|
total_kb: u64,
|
||||||
|
available_kb: u64,
|
||||||
|
free_kb: u64,
|
||||||
|
buffers_kb: u64,
|
||||||
|
cached_kb: u64,
|
||||||
|
swap_total_kb: u64,
|
||||||
|
swap_free_kb: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MemoryCollector {
|
||||||
|
pub fn new(config: MemoryConfig) -> Self {
|
||||||
|
// Create hysteresis thresholds with 5% gap for memory usage
|
||||||
|
let usage_thresholds = HysteresisThresholds::with_custom_gaps(
|
||||||
|
config.usage_warning_percent,
|
||||||
|
5.0, // 5% gap for warning recovery
|
||||||
|
config.usage_critical_percent,
|
||||||
|
5.0, // 5% gap for critical recovery
|
||||||
|
);
|
||||||
|
|
||||||
|
Self {
|
||||||
|
usage_thresholds,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate memory usage status using hysteresis thresholds
|
||||||
|
fn calculate_usage_status(&self, metric_name: &str, usage_percent: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||||
|
status_tracker.calculate_with_hysteresis(metric_name, usage_percent, &self.usage_thresholds)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse /proc/meminfo efficiently
|
||||||
|
/// Format: "MemTotal: 16384000 kB"
|
||||||
|
async fn parse_meminfo(&self) -> Result<MemoryInfo, CollectorError> {
|
||||||
|
let content = utils::read_proc_file("/proc/meminfo")?;
|
||||||
|
let mut info = MemoryInfo::default();
|
||||||
|
|
||||||
|
// Parse each line efficiently - only extract what we need
|
||||||
|
for line in content.lines() {
|
||||||
|
if let Some(colon_pos) = line.find(':') {
|
||||||
|
let key = &line[..colon_pos];
|
||||||
|
let value_part = &line[colon_pos + 1..];
|
||||||
|
|
||||||
|
// Extract number from value part (format: " 12345 kB")
|
||||||
|
if let Some(number_str) = value_part.split_whitespace().next() {
|
||||||
|
if let Ok(value_kb) = utils::parse_u64(number_str) {
|
||||||
|
match key {
|
||||||
|
"MemTotal" => info.total_kb = value_kb,
|
||||||
|
"MemAvailable" => info.available_kb = value_kb,
|
||||||
|
"MemFree" => info.free_kb = value_kb,
|
||||||
|
"Buffers" => info.buffers_kb = value_kb,
|
||||||
|
"Cached" => info.cached_kb = value_kb,
|
||||||
|
"SwapTotal" => info.swap_total_kb = value_kb,
|
||||||
|
"SwapFree" => info.swap_free_kb = value_kb,
|
||||||
|
_ => {} // Skip other fields for efficiency
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate that we got essential fields
|
||||||
|
if info.total_kb == 0 {
|
||||||
|
return Err(CollectorError::Parse {
|
||||||
|
value: "MemTotal".to_string(),
|
||||||
|
error: "MemTotal not found or zero in /proc/meminfo".to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// If MemAvailable is not available (older kernels), calculate it
|
||||||
|
if info.available_kb == 0 {
|
||||||
|
info.available_kb = info.free_kb + info.buffers_kb + info.cached_kb;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(info)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert KB to GB efficiently (avoiding floating point in hot path)
|
||||||
|
fn kb_to_gb(kb: u64) -> f32 {
|
||||||
|
kb as f32 / 1_048_576.0 // 1024 * 1024
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate memory metrics from parsed info
|
||||||
|
fn calculate_metrics(&self, info: &MemoryInfo, status_tracker: &mut StatusTracker) -> Vec<Metric> {
|
||||||
|
let mut metrics = Vec::with_capacity(6);
|
||||||
|
|
||||||
|
// Calculate derived values
|
||||||
|
let used_kb = info.total_kb - info.available_kb;
|
||||||
|
let usage_percent = (used_kb as f32 / info.total_kb as f32) * 100.0;
|
||||||
|
let usage_status = self.calculate_usage_status(registry::MEMORY_USAGE_PERCENT, usage_percent, status_tracker);
|
||||||
|
|
||||||
|
let swap_used_kb = info.swap_total_kb - info.swap_free_kb;
|
||||||
|
|
||||||
|
// Convert to GB for metrics
|
||||||
|
let total_gb = Self::kb_to_gb(info.total_kb);
|
||||||
|
let used_gb = Self::kb_to_gb(used_kb);
|
||||||
|
let available_gb = Self::kb_to_gb(info.available_kb);
|
||||||
|
let swap_total_gb = Self::kb_to_gb(info.swap_total_kb);
|
||||||
|
let swap_used_gb = Self::kb_to_gb(swap_used_kb);
|
||||||
|
|
||||||
|
// Memory usage percentage (primary metric with status)
|
||||||
|
metrics.push(
|
||||||
|
Metric::new(
|
||||||
|
registry::MEMORY_USAGE_PERCENT.to_string(),
|
||||||
|
MetricValue::Float(usage_percent),
|
||||||
|
usage_status,
|
||||||
|
)
|
||||||
|
.with_description("Memory usage percentage".to_string())
|
||||||
|
.with_unit("%".to_string()),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Total memory
|
||||||
|
metrics.push(
|
||||||
|
Metric::new(
|
||||||
|
registry::MEMORY_TOTAL_GB.to_string(),
|
||||||
|
MetricValue::Float(total_gb),
|
||||||
|
Status::Ok, // Total memory doesn't have status
|
||||||
|
)
|
||||||
|
.with_description("Total system memory".to_string())
|
||||||
|
.with_unit("GB".to_string()),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Used memory
|
||||||
|
metrics.push(
|
||||||
|
Metric::new(
|
||||||
|
registry::MEMORY_USED_GB.to_string(),
|
||||||
|
MetricValue::Float(used_gb),
|
||||||
|
Status::Ok, // Used memory absolute value doesn't have status
|
||||||
|
)
|
||||||
|
.with_description("Used system memory".to_string())
|
||||||
|
.with_unit("GB".to_string()),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Available memory
|
||||||
|
metrics.push(
|
||||||
|
Metric::new(
|
||||||
|
registry::MEMORY_AVAILABLE_GB.to_string(),
|
||||||
|
MetricValue::Float(available_gb),
|
||||||
|
Status::Ok, // Available memory absolute value doesn't have status
|
||||||
|
)
|
||||||
|
.with_description("Available system memory".to_string())
|
||||||
|
.with_unit("GB".to_string()),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Swap metrics (only if swap exists)
|
||||||
|
if info.swap_total_kb > 0 {
|
||||||
|
metrics.push(
|
||||||
|
Metric::new(
|
||||||
|
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
|
||||||
|
MetricValue::Float(swap_total_gb),
|
||||||
|
Status::Ok,
|
||||||
|
)
|
||||||
|
.with_description("Total swap space".to_string())
|
||||||
|
.with_unit("GB".to_string()),
|
||||||
|
);
|
||||||
|
|
||||||
|
metrics.push(
|
||||||
|
Metric::new(
|
||||||
|
registry::MEMORY_SWAP_USED_GB.to_string(),
|
||||||
|
MetricValue::Float(swap_used_gb),
|
||||||
|
Status::Ok,
|
||||||
|
)
|
||||||
|
.with_description("Used swap space".to_string())
|
||||||
|
.with_unit("GB".to_string()),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Monitor tmpfs (/tmp) usage
|
||||||
|
if let Ok(tmpfs_metrics) = self.get_tmpfs_metrics(status_tracker) {
|
||||||
|
metrics.extend(tmpfs_metrics);
|
||||||
|
}
|
||||||
|
|
||||||
|
metrics
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get tmpfs (/tmp) usage metrics
|
||||||
|
fn get_tmpfs_metrics(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||||
|
use std::process::Command;
|
||||||
|
|
||||||
|
let output = Command::new("df")
|
||||||
|
.arg("--block-size=1")
|
||||||
|
.arg("/tmp")
|
||||||
|
.output()
|
||||||
|
.map_err(|e| CollectorError::SystemRead {
|
||||||
|
path: "/tmp".to_string(),
|
||||||
|
error: e.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
return Ok(Vec::new()); // Return empty if /tmp not available
|
||||||
|
}
|
||||||
|
|
||||||
|
let output_str = String::from_utf8(output.stdout)
|
||||||
|
.map_err(|e| CollectorError::Parse {
|
||||||
|
value: "df output".to_string(),
|
||||||
|
error: e.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let lines: Vec<&str> = output_str.lines().collect();
|
||||||
|
if lines.len() < 2 {
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
let fields: Vec<&str> = lines[1].split_whitespace().collect();
|
||||||
|
if fields.len() < 4 {
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
let total_bytes: u64 = fields[1].parse()
|
||||||
|
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
||||||
|
value: fields[1].to_string(),
|
||||||
|
error: e.to_string(),
|
||||||
|
})?;
|
||||||
|
let used_bytes: u64 = fields[2].parse()
|
||||||
|
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
||||||
|
value: fields[2].to_string(),
|
||||||
|
error: e.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let total_gb = total_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
|
||||||
|
let used_gb = used_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
|
||||||
|
let usage_percent = if total_bytes > 0 {
|
||||||
|
(used_bytes as f32 / total_bytes as f32) * 100.0
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut metrics = Vec::new();
|
||||||
|
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||||
|
|
||||||
|
// Calculate status using same thresholds as main memory
|
||||||
|
let tmp_status = self.calculate_usage_status("memory_tmp_usage_percent", usage_percent, status_tracker);
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "memory_tmp_usage_percent".to_string(),
|
||||||
|
value: MetricValue::Float(usage_percent),
|
||||||
|
unit: Some("%".to_string()),
|
||||||
|
description: Some("tmpfs /tmp usage percentage".to_string()),
|
||||||
|
status: tmp_status,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "memory_tmp_used_gb".to_string(),
|
||||||
|
value: MetricValue::Float(used_gb),
|
||||||
|
unit: Some("GB".to_string()),
|
||||||
|
description: Some("tmpfs /tmp used space".to_string()),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "memory_tmp_total_gb".to_string(),
|
||||||
|
value: MetricValue::Float(total_gb),
|
||||||
|
unit: Some("GB".to_string()),
|
||||||
|
description: Some("tmpfs /tmp total space".to_string()),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(metrics)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Collector for MemoryCollector {
|
||||||
|
|
||||||
|
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||||
|
debug!("Collecting memory metrics");
|
||||||
|
let start = std::time::Instant::now();
|
||||||
|
|
||||||
|
// Parse memory info from /proc/meminfo
|
||||||
|
let info = self.parse_meminfo().await?;
|
||||||
|
|
||||||
|
// Calculate all metrics from parsed info
|
||||||
|
let metrics = self.calculate_metrics(&info, status_tracker);
|
||||||
|
|
||||||
|
let duration = start.elapsed();
|
||||||
|
debug!(
|
||||||
|
"Memory collection completed in {:?} with {} metrics",
|
||||||
|
duration,
|
||||||
|
metrics.len()
|
||||||
|
);
|
||||||
|
|
||||||
|
// Efficiency check: warn if collection takes too long
|
||||||
|
if duration.as_millis() > 1 {
|
||||||
|
debug!(
|
||||||
|
"Memory collection took {}ms - consider optimization",
|
||||||
|
duration.as_millis()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Store performance metrics
|
||||||
|
// Performance tracking handled by cache system
|
||||||
|
|
||||||
|
Ok(metrics)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@@ -1,28 +1,99 @@
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use serde_json::Value;
|
use cm_dashboard_shared::{Metric, StatusTracker};
|
||||||
use std::time::Duration;
|
|
||||||
|
|
||||||
pub mod backup;
|
pub mod backup;
|
||||||
|
pub mod cpu;
|
||||||
|
pub mod disk;
|
||||||
pub mod error;
|
pub mod error;
|
||||||
pub mod service;
|
pub mod memory;
|
||||||
pub mod smart;
|
pub mod nixos;
|
||||||
pub mod system;
|
pub mod systemd;
|
||||||
|
|
||||||
pub use error::CollectorError;
|
pub use error::CollectorError;
|
||||||
|
|
||||||
pub use cm_dashboard_shared::envelope::AgentType;
|
|
||||||
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct CollectorOutput {
|
|
||||||
pub agent_type: AgentType,
|
|
||||||
pub data: Value,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
/// Base trait for all collectors with extreme efficiency requirements
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
pub trait Collector: Send + Sync {
|
pub trait Collector: Send + Sync {
|
||||||
fn name(&self) -> &str;
|
/// Collect all metrics this collector provides
|
||||||
fn agent_type(&self) -> AgentType;
|
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError>;
|
||||||
fn collect_interval(&self) -> Duration;
|
|
||||||
async fn collect(&self) -> Result<CollectorOutput, CollectorError>;
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/// CPU efficiency rules for all collectors
|
||||||
|
pub mod efficiency {
|
||||||
|
//! CRITICAL: All collectors must follow these efficiency rules to minimize system impact
|
||||||
|
//!
|
||||||
|
//! # FILE READING RULES
|
||||||
|
//! - Read entire files in single syscall when possible
|
||||||
|
//! - Use BufReader only for very large files (>4KB)
|
||||||
|
//! - Never read files character by character
|
||||||
|
//! - Cache file descriptors when safe (immutable paths)
|
||||||
|
//!
|
||||||
|
//! # PARSING RULES
|
||||||
|
//! - Use split() instead of regex for simple patterns
|
||||||
|
//! - Parse numbers with from_str() not complex parsing
|
||||||
|
//! - Avoid string allocations in hot paths
|
||||||
|
//! - Use str::trim() before parsing numbers
|
||||||
|
//!
|
||||||
|
//! # MEMORY ALLOCATION RULES
|
||||||
|
//! - Reuse Vec buffers when possible
|
||||||
|
//! - Pre-allocate collections with known sizes
|
||||||
|
//! - Use str slices instead of String when possible
|
||||||
|
//! - Avoid clone() in hot paths
|
||||||
|
//!
|
||||||
|
//! # SYSTEM CALL RULES
|
||||||
|
//! - Minimize syscalls - prefer single reads over multiple
|
||||||
|
//! - Use /proc filesystem efficiently
|
||||||
|
//! - Avoid spawning processes when /proc data available
|
||||||
|
//! - Cache static data (like CPU count)
|
||||||
|
//!
|
||||||
|
//! # ERROR HANDLING RULES
|
||||||
|
//! - Use Result<> but minimize allocation in error paths
|
||||||
|
//! - Log errors at debug level only to avoid I/O overhead
|
||||||
|
//! - Graceful degradation - missing metrics better than failing
|
||||||
|
//! - Never panic in collectors
|
||||||
|
//!
|
||||||
|
//! # CONCURRENCY RULES
|
||||||
|
//! - Collectors must be thread-safe but avoid locks
|
||||||
|
//! - Use atomic operations for simple counters
|
||||||
|
//! - Avoid shared mutable state between collections
|
||||||
|
//! - Each collection should be independent
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Utility functions for efficient system data collection
|
||||||
|
pub mod utils {
|
||||||
|
use super::CollectorError;
|
||||||
|
use std::fs;
|
||||||
|
|
||||||
|
/// Read entire file content efficiently
|
||||||
|
pub fn read_proc_file(path: &str) -> Result<String, CollectorError> {
|
||||||
|
fs::read_to_string(path).map_err(|e| CollectorError::SystemRead {
|
||||||
|
path: path.to_string(),
|
||||||
|
error: e.to_string(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse float from string slice efficiently
|
||||||
|
pub fn parse_f32(s: &str) -> Result<f32, CollectorError> {
|
||||||
|
s.trim()
|
||||||
|
.parse()
|
||||||
|
.map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
|
||||||
|
value: s.to_string(),
|
||||||
|
error: e.to_string(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse integer from string slice efficiently
|
||||||
|
pub fn parse_u64(s: &str) -> Result<u64, CollectorError> {
|
||||||
|
s.trim()
|
||||||
|
.parse()
|
||||||
|
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
||||||
|
value: s.to_string(),
|
||||||
|
error: e.to_string(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
172
agent/src/collectors/nixos.rs
Normal file
172
agent/src/collectors/nixos.rs
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
use async_trait::async_trait;
|
||||||
|
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
||||||
|
use std::process::Command;
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use super::{Collector, CollectorError};
|
||||||
|
use crate::config::NixOSConfig;
|
||||||
|
|
||||||
|
/// NixOS system information collector
|
||||||
|
///
|
||||||
|
/// Collects NixOS-specific system information including:
|
||||||
|
/// - NixOS version and build information
|
||||||
|
pub struct NixOSCollector {
|
||||||
|
}
|
||||||
|
|
||||||
|
impl NixOSCollector {
|
||||||
|
pub fn new(_config: NixOSConfig) -> Self {
|
||||||
|
Self {}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Get agent hash from binary path
|
||||||
|
fn get_agent_hash(&self) -> Result<String, Box<dyn std::error::Error>> {
|
||||||
|
// Get the path of the current executable
|
||||||
|
let exe_path = std::env::current_exe()?;
|
||||||
|
let exe_str = exe_path.to_string_lossy();
|
||||||
|
|
||||||
|
// Extract Nix store hash from path like /nix/store/fn804fh332mp8gz06qawminpj20xl25h-cm-dashboard-0.1.0/bin/cm-dashboard-agent
|
||||||
|
if let Some(store_path) = exe_str.strip_prefix("/nix/store/") {
|
||||||
|
if let Some(dash_pos) = store_path.find('-') {
|
||||||
|
return Ok(store_path[..dash_pos].to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback to "unknown" if not in Nix store
|
||||||
|
Ok("unknown".to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get configuration hash from deployed nix store system
|
||||||
|
/// Get git commit hash from rebuild process
|
||||||
|
fn get_git_commit(&self) -> Result<String, Box<dyn std::error::Error>> {
|
||||||
|
let commit_file = "/var/lib/cm-dashboard/git-commit";
|
||||||
|
match std::fs::read_to_string(commit_file) {
|
||||||
|
Ok(content) => {
|
||||||
|
let commit_hash = content.trim();
|
||||||
|
if commit_hash.len() >= 7 {
|
||||||
|
Ok(commit_hash.to_string())
|
||||||
|
} else {
|
||||||
|
Err("Git commit hash too short".into())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => Err(format!("Failed to read git commit file: {}", e).into())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_config_hash(&self) -> Result<String, Box<dyn std::error::Error>> {
|
||||||
|
// Read the symlink target of /run/current-system to get nix store path
|
||||||
|
let output = Command::new("readlink")
|
||||||
|
.arg("/run/current-system")
|
||||||
|
.output()?;
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
return Err("readlink command failed".into());
|
||||||
|
}
|
||||||
|
|
||||||
|
let binding = String::from_utf8_lossy(&output.stdout);
|
||||||
|
let store_path = binding.trim();
|
||||||
|
|
||||||
|
// Extract hash from nix store path
|
||||||
|
// Format: /nix/store/HASH-nixos-system-HOSTNAME-VERSION
|
||||||
|
if let Some(hash_part) = store_path.strip_prefix("/nix/store/") {
|
||||||
|
if let Some(hash) = hash_part.split('-').next() {
|
||||||
|
if hash.len() >= 8 {
|
||||||
|
// Return first 8 characters of nix store hash
|
||||||
|
return Ok(hash[..8].to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Err("Could not extract hash from nix store path".into())
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Collector for NixOSCollector {
|
||||||
|
|
||||||
|
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||||
|
debug!("Collecting NixOS system information");
|
||||||
|
let mut metrics = Vec::new();
|
||||||
|
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||||
|
|
||||||
|
// Collect git commit information (shows what's actually deployed)
|
||||||
|
match self.get_git_commit() {
|
||||||
|
Ok(git_commit) => {
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "system_nixos_build".to_string(),
|
||||||
|
value: MetricValue::String(git_commit),
|
||||||
|
unit: None,
|
||||||
|
description: Some("Git commit hash of deployed configuration".to_string()),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
debug!("Failed to get git commit: {}", e);
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "system_nixos_build".to_string(),
|
||||||
|
value: MetricValue::String("unknown".to_string()),
|
||||||
|
unit: None,
|
||||||
|
description: Some("Git commit hash (failed to detect)".to_string()),
|
||||||
|
status: Status::Unknown,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Collect config hash
|
||||||
|
match self.get_config_hash() {
|
||||||
|
Ok(hash) => {
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "system_config_hash".to_string(),
|
||||||
|
value: MetricValue::String(hash),
|
||||||
|
unit: None,
|
||||||
|
description: Some("NixOS deployed configuration hash".to_string()),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
debug!("Failed to get config hash: {}", e);
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "system_config_hash".to_string(),
|
||||||
|
value: MetricValue::String("unknown".to_string()),
|
||||||
|
unit: None,
|
||||||
|
description: Some("Deployed config hash (failed to detect)".to_string()),
|
||||||
|
status: Status::Unknown,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect agent hash
|
||||||
|
match self.get_agent_hash() {
|
||||||
|
Ok(hash) => {
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "system_agent_hash".to_string(),
|
||||||
|
value: MetricValue::String(hash),
|
||||||
|
unit: None,
|
||||||
|
description: Some("Agent Nix store hash".to_string()),
|
||||||
|
status: Status::Ok,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
debug!("Failed to get agent hash: {}", e);
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: "system_agent_hash".to_string(),
|
||||||
|
value: MetricValue::String("unknown".to_string()),
|
||||||
|
unit: None,
|
||||||
|
description: Some("Agent hash (failed to detect)".to_string()),
|
||||||
|
status: Status::Unknown,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!("Collected {} NixOS metrics", metrics.len());
|
||||||
|
Ok(metrics)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,1443 +0,0 @@
|
|||||||
use async_trait::async_trait;
|
|
||||||
use chrono::Utc;
|
|
||||||
use serde::Serialize;
|
|
||||||
use serde_json::json;
|
|
||||||
use std::process::Stdio;
|
|
||||||
use std::time::{Duration, Instant};
|
|
||||||
use tokio::fs;
|
|
||||||
use tokio::process::Command;
|
|
||||||
use tokio::time::timeout;
|
|
||||||
|
|
||||||
use super::{AgentType, Collector, CollectorError, CollectorOutput};
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct ServiceCollector {
|
|
||||||
pub interval: Duration,
|
|
||||||
pub services: Vec<String>,
|
|
||||||
pub timeout_ms: u64,
|
|
||||||
pub cpu_tracking: std::sync::Arc<tokio::sync::Mutex<std::collections::HashMap<u32, CpuSample>>>,
|
|
||||||
pub description_cache: std::sync::Arc<tokio::sync::Mutex<std::collections::HashMap<String, Vec<String>>>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub(crate) struct CpuSample {
|
|
||||||
utime: u64,
|
|
||||||
stime: u64,
|
|
||||||
timestamp: std::time::Instant,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ServiceCollector {
|
|
||||||
pub fn new(_enabled: bool, interval_ms: u64, services: Vec<String>) -> Self {
|
|
||||||
Self {
|
|
||||||
interval: Duration::from_millis(interval_ms),
|
|
||||||
services,
|
|
||||||
timeout_ms: 10000, // 10 second timeout for service checks
|
|
||||||
cpu_tracking: std::sync::Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new())),
|
|
||||||
description_cache: std::sync::Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new())),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_service_status(&self, service: &str) -> Result<ServiceData, CollectorError> {
|
|
||||||
let timeout_duration = Duration::from_millis(self.timeout_ms);
|
|
||||||
|
|
||||||
// Use more efficient systemctl command - just get the essential info
|
|
||||||
let status_output = timeout(
|
|
||||||
timeout_duration,
|
|
||||||
Command::new("/run/current-system/sw/bin/systemctl")
|
|
||||||
.args(["show", service, "--property=ActiveState,SubState,MainPID", "--no-pager"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output(),
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.map_err(|_| CollectorError::Timeout {
|
|
||||||
duration_ms: self.timeout_ms,
|
|
||||||
})?
|
|
||||||
.map_err(|e| CollectorError::CommandFailed {
|
|
||||||
command: format!("systemctl show {}", service),
|
|
||||||
message: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
if !status_output.status.success() {
|
|
||||||
return Err(CollectorError::ServiceNotFound {
|
|
||||||
service: service.to_string(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let status_stdout = String::from_utf8_lossy(&status_output.stdout);
|
|
||||||
let mut active_state = None;
|
|
||||||
let mut sub_state = None;
|
|
||||||
let mut main_pid = None;
|
|
||||||
|
|
||||||
for line in status_stdout.lines() {
|
|
||||||
if let Some(value) = line.strip_prefix("ActiveState=") {
|
|
||||||
active_state = Some(value.to_string());
|
|
||||||
} else if let Some(value) = line.strip_prefix("SubState=") {
|
|
||||||
sub_state = Some(value.to_string());
|
|
||||||
} else if let Some(value) = line.strip_prefix("MainPID=") {
|
|
||||||
main_pid = value.parse::<u32>().ok();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if service is sandboxed (needed for status determination)
|
|
||||||
let is_sandboxed = self.check_service_sandbox(service).await.unwrap_or(false);
|
|
||||||
let is_sandbox_excluded = self.is_sandbox_excluded(service);
|
|
||||||
|
|
||||||
let status = self.determine_service_status(&active_state, &sub_state, is_sandboxed, service);
|
|
||||||
|
|
||||||
// Get resource usage if service is running
|
|
||||||
let (memory_used_mb, cpu_percent) = if let Some(pid) = main_pid {
|
|
||||||
self.get_process_resources(pid).await.unwrap_or((0.0, 0.0))
|
|
||||||
} else {
|
|
||||||
(0.0, 0.0)
|
|
||||||
};
|
|
||||||
|
|
||||||
// Get memory quota from systemd if available
|
|
||||||
let memory_quota_mb = self.get_service_memory_limit(service).await.unwrap_or(0.0);
|
|
||||||
|
|
||||||
// Get disk usage for this service (only for running services)
|
|
||||||
let disk_used_gb = if matches!(status, ServiceStatus::Running) {
|
|
||||||
self.get_service_disk_usage(service).await.unwrap_or(0.0)
|
|
||||||
} else {
|
|
||||||
0.0
|
|
||||||
};
|
|
||||||
|
|
||||||
// Get disk quota for this service (if configured)
|
|
||||||
let disk_quota_gb = if matches!(status, ServiceStatus::Running) {
|
|
||||||
self.get_service_disk_quota(service).await.unwrap_or(0.0)
|
|
||||||
} else {
|
|
||||||
0.0
|
|
||||||
};
|
|
||||||
|
|
||||||
// Get service-specific description (only for running services)
|
|
||||||
let description = if matches!(status, ServiceStatus::Running) {
|
|
||||||
self.get_service_description_with_cache(service).await
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(ServiceData {
|
|
||||||
name: service.to_string(),
|
|
||||||
status,
|
|
||||||
memory_used_mb,
|
|
||||||
memory_quota_mb,
|
|
||||||
cpu_percent,
|
|
||||||
sandbox_limit: None, // TODO: Implement sandbox limit detection
|
|
||||||
disk_used_gb,
|
|
||||||
disk_quota_gb,
|
|
||||||
is_sandboxed,
|
|
||||||
is_sandbox_excluded,
|
|
||||||
description,
|
|
||||||
sub_service: None,
|
|
||||||
latency_ms: None,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_sandbox_excluded(&self, service: &str) -> bool {
|
|
||||||
// Services that don't need sandboxing due to their nature
|
|
||||||
matches!(service,
|
|
||||||
"sshd" | "ssh" | // SSH needs system access for auth/shell
|
|
||||||
"docker" | // Docker needs broad system access
|
|
||||||
"systemd-logind" | // System service
|
|
||||||
"systemd-resolved" | // System service
|
|
||||||
"dbus" | // System service
|
|
||||||
"NetworkManager" | // Network management
|
|
||||||
"wpa_supplicant" // WiFi management
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn determine_service_status(
|
|
||||||
&self,
|
|
||||||
active_state: &Option<String>,
|
|
||||||
sub_state: &Option<String>,
|
|
||||||
is_sandboxed: bool,
|
|
||||||
service_name: &str,
|
|
||||||
) -> ServiceStatus {
|
|
||||||
match (active_state.as_deref(), sub_state.as_deref()) {
|
|
||||||
(Some("active"), Some("running")) => {
|
|
||||||
// Check if service is excluded from sandbox requirements
|
|
||||||
if self.is_sandbox_excluded(service_name) || is_sandboxed {
|
|
||||||
ServiceStatus::Running
|
|
||||||
} else {
|
|
||||||
ServiceStatus::Degraded // Warning status for unsandboxed running services
|
|
||||||
}
|
|
||||||
},
|
|
||||||
(Some("active"), Some("exited")) => {
|
|
||||||
// One-shot services should also be degraded if not sandboxed
|
|
||||||
if self.is_sandbox_excluded(service_name) || is_sandboxed {
|
|
||||||
ServiceStatus::Running
|
|
||||||
} else {
|
|
||||||
ServiceStatus::Degraded
|
|
||||||
}
|
|
||||||
},
|
|
||||||
(Some("reloading"), _) | (Some("activating"), _) => ServiceStatus::Restarting,
|
|
||||||
(Some("failed"), _) | (Some("inactive"), Some("failed")) => ServiceStatus::Stopped,
|
|
||||||
(Some("inactive"), _) => ServiceStatus::Stopped,
|
|
||||||
_ => ServiceStatus::Degraded,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_process_resources(&self, pid: u32) -> Result<(f32, f32), CollectorError> {
|
|
||||||
// Read /proc/{pid}/stat for CPU and memory info
|
|
||||||
let stat_path = format!("/proc/{}/stat", pid);
|
|
||||||
let stat_content =
|
|
||||||
fs::read_to_string(&stat_path)
|
|
||||||
.await
|
|
||||||
.map_err(|e| CollectorError::IoError {
|
|
||||||
message: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let stat_fields: Vec<&str> = stat_content.split_whitespace().collect();
|
|
||||||
if stat_fields.len() < 24 {
|
|
||||||
return Err(CollectorError::ParseError {
|
|
||||||
message: format!("Invalid /proc/{}/stat format", pid),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Field 23 is RSS (Resident Set Size) in pages
|
|
||||||
let rss_pages: u64 = stat_fields[23]
|
|
||||||
.parse()
|
|
||||||
.map_err(|e| CollectorError::ParseError {
|
|
||||||
message: format!("Failed to parse RSS from /proc/{}/stat: {}", pid, e),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
// Convert pages to MB (assuming 4KB pages)
|
|
||||||
let memory_mb = (rss_pages * 4) as f32 / 1024.0;
|
|
||||||
|
|
||||||
// Calculate CPU percentage
|
|
||||||
let cpu_percent = self.calculate_cpu_usage(pid, &stat_fields).await.unwrap_or(0.0);
|
|
||||||
|
|
||||||
Ok((memory_mb, cpu_percent))
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn calculate_cpu_usage(&self, pid: u32, stat_fields: &[&str]) -> Result<f32, CollectorError> {
|
|
||||||
// Parse CPU time fields from /proc/pid/stat
|
|
||||||
let utime: u64 = stat_fields[13].parse().map_err(|e| CollectorError::ParseError {
|
|
||||||
message: format!("Failed to parse utime: {}", e),
|
|
||||||
})?;
|
|
||||||
let stime: u64 = stat_fields[14].parse().map_err(|e| CollectorError::ParseError {
|
|
||||||
message: format!("Failed to parse stime: {}", e),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let now = std::time::Instant::now();
|
|
||||||
let current_sample = CpuSample {
|
|
||||||
utime,
|
|
||||||
stime,
|
|
||||||
timestamp: now,
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut cpu_tracking = self.cpu_tracking.lock().await;
|
|
||||||
|
|
||||||
let cpu_percent = if let Some(previous_sample) = cpu_tracking.get(&pid) {
|
|
||||||
let time_delta = now.duration_since(previous_sample.timestamp).as_secs_f32();
|
|
||||||
if time_delta > 0.1 { // At least 100ms between samples
|
|
||||||
let utime_delta = current_sample.utime.saturating_sub(previous_sample.utime);
|
|
||||||
let stime_delta = current_sample.stime.saturating_sub(previous_sample.stime);
|
|
||||||
let total_delta = utime_delta + stime_delta;
|
|
||||||
|
|
||||||
// Convert from jiffies to CPU percentage
|
|
||||||
// sysconf(_SC_CLK_TCK) is typically 100 on Linux
|
|
||||||
let hz = 100.0; // Clock ticks per second
|
|
||||||
let cpu_time_used = total_delta as f32 / hz;
|
|
||||||
let cpu_percent = (cpu_time_used / time_delta) * 100.0;
|
|
||||||
|
|
||||||
// Cap at reasonable values
|
|
||||||
cpu_percent.min(999.9)
|
|
||||||
} else {
|
|
||||||
0.0 // Too soon for accurate measurement
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
0.0 // First measurement, no baseline
|
|
||||||
};
|
|
||||||
|
|
||||||
// Store current sample for next calculation
|
|
||||||
cpu_tracking.insert(pid, current_sample);
|
|
||||||
|
|
||||||
// Clean up old entries (processes that no longer exist)
|
|
||||||
let cutoff = now - Duration::from_secs(300); // 5 minutes
|
|
||||||
cpu_tracking.retain(|_, sample| sample.timestamp > cutoff);
|
|
||||||
|
|
||||||
Ok(cpu_percent)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_service_disk_usage(&self, service: &str) -> Result<f32, CollectorError> {
|
|
||||||
// Only check the most likely path to avoid multiple du calls
|
|
||||||
let primary_path = format!("/var/lib/{}", service);
|
|
||||||
|
|
||||||
// Use a quick check first - if directory doesn't exist, don't run du
|
|
||||||
if tokio::fs::metadata(&primary_path).await.is_err() {
|
|
||||||
return Ok(0.0);
|
|
||||||
}
|
|
||||||
|
|
||||||
self.get_directory_size(&primary_path).await
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_directory_size(&self, path: &str) -> Result<f32, CollectorError> {
|
|
||||||
let output = Command::new("sudo")
|
|
||||||
.args(["/run/current-system/sw/bin/du", "-s", "-k", path]) // Use kilobytes instead of forcing GB
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.map_err(|e| CollectorError::CommandFailed {
|
|
||||||
command: format!("du -s -k {}", path),
|
|
||||||
message: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
// Directory doesn't exist or permission denied - return 0
|
|
||||||
return Ok(0.0);
|
|
||||||
}
|
|
||||||
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
if let Some(line) = stdout.lines().next() {
|
|
||||||
if let Some(size_str) = line.split_whitespace().next() {
|
|
||||||
let size_kb = size_str.parse::<f32>().unwrap_or(0.0);
|
|
||||||
let size_gb = size_kb / (1024.0 * 1024.0); // Convert KB to GB
|
|
||||||
return Ok(size_gb);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(0.0)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_service_disk_quota(&self, service: &str) -> Result<f32, CollectorError> {
|
|
||||||
// Check systemd service properties for NixOS hardening-related disk restrictions
|
|
||||||
let systemd_output = Command::new("/run/current-system/sw/bin/systemctl")
|
|
||||||
.args(["show", service, "--property=PrivateTmp,ProtectHome,ProtectSystem,ReadOnlyPaths,InaccessiblePaths,BindPaths,BindReadOnlyPaths", "--no-pager"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await;
|
|
||||||
|
|
||||||
if let Ok(output) = systemd_output {
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
|
|
||||||
// Parse systemd properties that might indicate disk restrictions
|
|
||||||
let mut private_tmp = false;
|
|
||||||
let mut protect_system = false;
|
|
||||||
let mut readonly_paths = Vec::new();
|
|
||||||
|
|
||||||
for line in stdout.lines() {
|
|
||||||
if line.starts_with("PrivateTmp=yes") {
|
|
||||||
private_tmp = true;
|
|
||||||
} else if line.starts_with("ProtectSystem=strict") || line.starts_with("ProtectSystem=yes") {
|
|
||||||
protect_system = true;
|
|
||||||
} else if let Some(paths) = line.strip_prefix("ReadOnlyPaths=") {
|
|
||||||
readonly_paths.push(paths.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If service has significant restrictions, it might have implicit disk limits
|
|
||||||
// This is heuristic-based since systemd doesn't have direct disk quotas
|
|
||||||
if private_tmp && protect_system {
|
|
||||||
// Heavily sandboxed services might have practical disk limits
|
|
||||||
// Return a conservative estimate based on typical service needs
|
|
||||||
return Ok(1.0); // 1 GB as reasonable limit for sandboxed services
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check for service-specific disk configurations in NixOS
|
|
||||||
match service {
|
|
||||||
"docker" => {
|
|
||||||
// Docker might have storage driver limits in NixOS config
|
|
||||||
if let Ok(limit) = self.get_docker_storage_quota().await {
|
|
||||||
return Ok(limit);
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"postgresql" | "postgres" => {
|
|
||||||
// PostgreSQL might have tablespace or data directory limits
|
|
||||||
// Check for database-specific storage configuration
|
|
||||||
},
|
|
||||||
"mysql" | "mariadb" => {
|
|
||||||
// MySQL might have data directory size limits
|
|
||||||
},
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
|
|
||||||
// No quota found
|
|
||||||
Err(CollectorError::ParseError {
|
|
||||||
message: format!("No disk quota found for service {}", service),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn check_filesystem_quota(&self, path: &str) -> Result<f32, CollectorError> {
|
|
||||||
// Try to get filesystem quota information
|
|
||||||
let quota_output = Command::new("quota")
|
|
||||||
.args(["-f", path])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await;
|
|
||||||
|
|
||||||
if let Ok(output) = quota_output {
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
// Parse quota output (simplified implementation)
|
|
||||||
for line in stdout.lines() {
|
|
||||||
if line.contains("blocks") && line.contains("quota") {
|
|
||||||
// This would need proper parsing based on quota output format
|
|
||||||
// For now, return error indicating no quota parsing implemented
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(CollectorError::ParseError {
|
|
||||||
message: "No filesystem quota detected".to_string(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_docker_storage_quota(&self) -> Result<f32, CollectorError> {
|
|
||||||
// Check if Docker has storage limits configured
|
|
||||||
// This is a simplified check - full implementation would check storage driver settings
|
|
||||||
Err(CollectorError::ParseError {
|
|
||||||
message: "Docker storage quota detection not implemented".to_string(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn check_service_sandbox(&self, service: &str) -> Result<bool, CollectorError> {
|
|
||||||
// Check systemd service properties for sandboxing/hardening settings
|
|
||||||
let systemd_output = Command::new("/run/current-system/sw/bin/systemctl")
|
|
||||||
.args(["show", service, "--property=PrivateTmp,ProtectHome,ProtectSystem,NoNewPrivileges,PrivateDevices,ProtectKernelTunables,RestrictRealtime", "--no-pager"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await;
|
|
||||||
|
|
||||||
if let Ok(output) = systemd_output {
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
|
|
||||||
let mut sandbox_indicators = 0;
|
|
||||||
let mut total_checks = 0;
|
|
||||||
|
|
||||||
for line in stdout.lines() {
|
|
||||||
total_checks += 1;
|
|
||||||
|
|
||||||
// Check for various sandboxing properties
|
|
||||||
if line.starts_with("PrivateTmp=yes") ||
|
|
||||||
line.starts_with("ProtectHome=yes") ||
|
|
||||||
line.starts_with("ProtectSystem=strict") ||
|
|
||||||
line.starts_with("ProtectSystem=yes") ||
|
|
||||||
line.starts_with("NoNewPrivileges=yes") ||
|
|
||||||
line.starts_with("PrivateDevices=yes") ||
|
|
||||||
line.starts_with("ProtectKernelTunables=yes") ||
|
|
||||||
line.starts_with("RestrictRealtime=yes") {
|
|
||||||
sandbox_indicators += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Consider service sandboxed if it has multiple hardening features
|
|
||||||
let is_sandboxed = sandbox_indicators >= 3;
|
|
||||||
return Ok(is_sandboxed);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Default to not sandboxed if we can't determine
|
|
||||||
Ok(false)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_service_memory_limit(&self, service: &str) -> Result<f32, CollectorError> {
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/systemctl")
|
|
||||||
.args(["show", service, "--property=MemoryMax", "--no-pager"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.map_err(|e| CollectorError::CommandFailed {
|
|
||||||
command: format!("systemctl show {} --property=MemoryMax", service),
|
|
||||||
message: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
for line in stdout.lines() {
|
|
||||||
if let Some(value) = line.strip_prefix("MemoryMax=") {
|
|
||||||
if value == "infinity" {
|
|
||||||
return Ok(0.0); // No limit
|
|
||||||
}
|
|
||||||
if let Ok(bytes) = value.parse::<u64>() {
|
|
||||||
return Ok(bytes as f32 / (1024.0 * 1024.0)); // Convert to MB
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(0.0) // No limit or couldn't parse
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
async fn get_system_memory_total(&self) -> Result<f32, CollectorError> {
|
|
||||||
// Read /proc/meminfo to get total system memory
|
|
||||||
let meminfo = fs::read_to_string("/proc/meminfo")
|
|
||||||
.await
|
|
||||||
.map_err(|e| CollectorError::IoError {
|
|
||||||
message: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
for line in meminfo.lines() {
|
|
||||||
if let Some(mem_total_line) = line.strip_prefix("MemTotal:") {
|
|
||||||
let parts: Vec<&str> = mem_total_line.trim().split_whitespace().collect();
|
|
||||||
if let Some(mem_kb_str) = parts.first() {
|
|
||||||
if let Ok(mem_kb) = mem_kb_str.parse::<f32>() {
|
|
||||||
return Ok(mem_kb / 1024.0); // Convert KB to MB
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(CollectorError::ParseError {
|
|
||||||
message: "Could not parse total memory".to_string(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_disk_usage(&self) -> Result<DiskUsage, CollectorError> {
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/df")
|
|
||||||
.args(["-BG", "--output=size,used,avail", "/"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.map_err(|e| CollectorError::CommandFailed {
|
|
||||||
command: "df -BG --output=size,used,avail /".to_string(),
|
|
||||||
message: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
||||||
return Err(CollectorError::CommandFailed {
|
|
||||||
command: "df -BG --output=size,used,avail /".to_string(),
|
|
||||||
message: stderr.to_string(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let lines: Vec<&str> = stdout.lines().collect();
|
|
||||||
|
|
||||||
if lines.len() < 2 {
|
|
||||||
return Err(CollectorError::ParseError {
|
|
||||||
message: "Unexpected df output format".to_string(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let data_line = lines[1].trim();
|
|
||||||
let parts: Vec<&str> = data_line.split_whitespace().collect();
|
|
||||||
if parts.len() < 3 {
|
|
||||||
return Err(CollectorError::ParseError {
|
|
||||||
message: format!("Unexpected df data format: {}", data_line),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let parse_size = |s: &str| -> Result<f32, CollectorError> {
|
|
||||||
s.trim_end_matches('G')
|
|
||||||
.parse::<f32>()
|
|
||||||
.map_err(|e| CollectorError::ParseError {
|
|
||||||
message: format!("Failed to parse disk size '{}': {}", s, e),
|
|
||||||
})
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(DiskUsage {
|
|
||||||
total_capacity_gb: parse_size(parts[0])?,
|
|
||||||
used_gb: parse_size(parts[1])?,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
fn determine_services_status(&self, healthy: usize, degraded: usize, failed: usize) -> String {
|
|
||||||
if failed > 0 {
|
|
||||||
"critical".to_string()
|
|
||||||
} else if degraded > 0 {
|
|
||||||
"warning".to_string()
|
|
||||||
} else if healthy > 0 {
|
|
||||||
"ok".to_string()
|
|
||||||
} else {
|
|
||||||
"unknown".to_string()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Best-effort GPU telemetry: returns `(load_percent, temperature_c)`.
///
/// Tries `nvidia-smi` first; if that binary is missing or fails, falls back
/// to the Raspberry Pi `vcgencmd measure_temp` tool, which only yields a
/// temperature (load stays `None`). Either value may be `None` when no
/// probe succeeds — callers treat this as "no GPU data available".
async fn get_gpu_metrics(&self) -> (Option<f32>, Option<f32>) {
    // Probe 1: NVIDIA GPUs. Relies on nvidia-smi being on PATH (bare
    // command name, unlike the absolute paths used elsewhere in this file).
    let output = Command::new("nvidia-smi")
        .args([
            "--query-gpu=utilization.gpu,temperature.gpu",
            "--format=csv,noheader,nounits",
        ])
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .output()
        .await;

    match output {
        Ok(result) if result.status.success() => {
            // Expected output: one CSV line per GPU, e.g. "12, 45".
            // Only the first GPU is reported.
            let stdout = String::from_utf8_lossy(&result.stdout);
            if let Some(line) = stdout.lines().next() {
                let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
                if parts.len() >= 2 {
                    // Unparsable fields degrade to None rather than failing.
                    let load = parts[0].parse::<f32>().ok();
                    let temp = parts[1].parse::<f32>().ok();
                    return (load, temp);
                }
            }
            (None, None)
        }
        Ok(_) | Err(_) => {
            // Probe 2: Raspberry Pi VideoCore temperature (no load metric).
            let util_output = Command::new("/opt/vc/bin/vcgencmd")
                .arg("measure_temp")
                .stdout(Stdio::piped())
                .stderr(Stdio::piped())
                .output()
                .await;

            if let Ok(result) = util_output {
                if result.status.success() {
                    // Expected output: "temp=45.6'C".
                    let stdout = String::from_utf8_lossy(&result.stdout);
                    if let Some(value) = stdout
                        .trim()
                        .strip_prefix("temp=")
                        .and_then(|s| s.strip_suffix("'C"))
                    {
                        if let Ok(temp_c) = value.parse::<f32>() {
                            return (None, Some(temp_c));
                        }
                    }
                }
            }

            (None, None)
        }
    }
}
|
|
||||||
|
|
||||||
|
|
||||||
async fn get_service_description_with_cache(&self, service: &str) -> Option<Vec<String>> {
|
|
||||||
// Check if we should update the cache (throttled)
|
|
||||||
let should_update = self.should_update_description(service).await;
|
|
||||||
|
|
||||||
if should_update {
|
|
||||||
if let Some(new_description) = self.get_service_description(service).await {
|
|
||||||
// Update cache
|
|
||||||
let mut cache = self.description_cache.lock().await;
|
|
||||||
cache.insert(service.to_string(), new_description.clone());
|
|
||||||
return Some(new_description);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Always return cached description if available
|
|
||||||
let cache = self.description_cache.lock().await;
|
|
||||||
cache.get(service).cloned()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Decides whether the description for a service is due for a refresh.
///
/// Throttling is not implemented yet: this always returns `true`, because
/// the description cache downstream already prevents redundant work. The
/// `_service` parameter is kept so per-service throttling can be added
/// later without changing callers.
async fn should_update_description(&self, _service: &str) -> bool {
    // For now, always update descriptions since we have caching
    // The cache will prevent redundant work
    true
}
|
|
||||||
|
|
||||||
async fn get_service_description(&self, service: &str) -> Option<Vec<String>> {
|
|
||||||
let result = match service {
|
|
||||||
// KEEP: nginx sites and docker containers (needed for sub-services)
|
|
||||||
"nginx" => self.get_nginx_description().await.map(|s| vec![s]),
|
|
||||||
"docker" => self.get_docker_containers().await,
|
|
||||||
|
|
||||||
// DISABLED: All connection monitoring for CPU/C-state testing
|
|
||||||
/*
|
|
||||||
"sshd" | "ssh" => self.get_ssh_active_users().await.map(|s| vec![s]),
|
|
||||||
"apache2" | "httpd" => self.get_web_server_connections().await.map(|s| vec![s]),
|
|
||||||
"docker-registry" => self.get_docker_registry_info().await.map(|s| vec![s]),
|
|
||||||
"postgresql" | "postgres" => self.get_postgres_connections().await.map(|s| vec![s]),
|
|
||||||
"mysql" | "mariadb" => self.get_mysql_connections().await.map(|s| vec![s]),
|
|
||||||
"redis" | "redis-immich" => self.get_redis_info().await.map(|s| vec![s]),
|
|
||||||
"immich-server" => self.get_immich_info().await.map(|s| vec![s]),
|
|
||||||
"vaultwarden" => self.get_vaultwarden_info().await.map(|s| vec![s]),
|
|
||||||
"unifi" => self.get_unifi_info().await.map(|s| vec![s]),
|
|
||||||
"mosquitto" => self.get_mosquitto_info().await.map(|s| vec![s]),
|
|
||||||
"haasp-webgrid" => self.get_haasp_webgrid_info().await.map(|s| vec![s]),
|
|
||||||
*/
|
|
||||||
_ => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
result
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_ssh_active_users(&self) -> Option<String> {
|
|
||||||
// Use ss to find established SSH connections on port 22
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/ss")
|
|
||||||
.args(["-tn", "state", "established", "sport", "= :22"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let mut connections = 0;
|
|
||||||
|
|
||||||
// Count lines excluding header
|
|
||||||
for line in stdout.lines().skip(1) {
|
|
||||||
if !line.trim().is_empty() {
|
|
||||||
connections += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if connections > 0 {
|
|
||||||
Some(format!("{} connections", connections))
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_web_server_connections(&self) -> Option<String> {
|
|
||||||
// Use simpler ss command with minimal output
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/ss")
|
|
||||||
.args(["-tn", "state", "established", "sport", ":80", "or", "sport", ":443"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let connection_count = stdout.lines().count().saturating_sub(1); // Subtract header line
|
|
||||||
|
|
||||||
if connection_count > 0 {
|
|
||||||
Some(format!("{} connections", connection_count))
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_docker_containers(&self) -> Option<Vec<String>> {
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/docker")
|
|
||||||
.args(["ps", "--format", "{{.Names}}"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let containers: Vec<String> = stdout
|
|
||||||
.lines()
|
|
||||||
.filter(|line| !line.trim().is_empty())
|
|
||||||
.map(|line| line.trim().to_string())
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
if containers.is_empty() {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some(containers)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_postgres_connections(&self) -> Option<String> {
|
|
||||||
let output = Command::new("sudo")
|
|
||||||
.args(["-u", "postgres", "/run/current-system/sw/bin/psql", "-t", "-c", "SELECT count(*) FROM pg_stat_activity WHERE state = 'active';"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
if let Some(line) = stdout.lines().next() {
|
|
||||||
if let Ok(count) = line.trim().parse::<i32>() {
|
|
||||||
if count > 0 {
|
|
||||||
return Some(format!("{} connections", count));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_mysql_connections(&self) -> Option<String> {
|
|
||||||
// Try mysql command first
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/mysql")
|
|
||||||
.args(["-e", "SHOW PROCESSLIST;"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let connection_count = stdout.lines().count().saturating_sub(1); // Subtract header line
|
|
||||||
|
|
||||||
if connection_count > 0 {
|
|
||||||
return Some(format!("{} connections", connection_count));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fallback: check MySQL unix socket connections (more common than TCP)
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/ss")
|
|
||||||
.args(["-x", "state", "connected", "src", "*mysql*"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let connection_count = stdout.lines().count().saturating_sub(1);
|
|
||||||
if connection_count > 0 {
|
|
||||||
return Some(format!("{} connections", connection_count));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Also try TCP port 3306 as final fallback
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/ss")
|
|
||||||
.args(["-tn", "state", "established", "dport", "= :3306"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let connection_count = stdout.lines().count().saturating_sub(1);
|
|
||||||
if connection_count > 0 {
|
|
||||||
return Some(format!("{} connections", connection_count));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_running_as_root(&self) -> bool {
|
|
||||||
std::env::var("USER").unwrap_or_default() == "root" ||
|
|
||||||
std::env::var("UID").unwrap_or_default() == "0"
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn measure_site_latency(&self, site_name: &str) -> (Option<f32>, bool) {
|
|
||||||
// Returns (latency, is_healthy)
|
|
||||||
// Construct URL from site name
|
|
||||||
let url = if site_name.contains("localhost") || site_name.contains("127.0.0.1") {
|
|
||||||
format!("http://{}", site_name)
|
|
||||||
} else {
|
|
||||||
format!("https://{}", site_name)
|
|
||||||
};
|
|
||||||
|
|
||||||
// Create HTTP client with short timeout
|
|
||||||
let client = match reqwest::Client::builder()
|
|
||||||
.timeout(Duration::from_secs(2))
|
|
||||||
.build()
|
|
||||||
{
|
|
||||||
Ok(client) => client,
|
|
||||||
Err(_) => return (None, false),
|
|
||||||
};
|
|
||||||
|
|
||||||
let start = Instant::now();
|
|
||||||
|
|
||||||
// Make GET request for better app compatibility (some apps don't handle HEAD properly)
|
|
||||||
match client.get(&url).send().await {
|
|
||||||
Ok(response) => {
|
|
||||||
let latency = start.elapsed().as_millis() as f32;
|
|
||||||
let is_healthy = response.status().is_success() || response.status().is_redirection();
|
|
||||||
(Some(latency), is_healthy)
|
|
||||||
}
|
|
||||||
Err(_) => {
|
|
||||||
// Connection failed, no latency measurement, not healthy
|
|
||||||
(None, false)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Discovers the hostnames nginx is serving by dumping its effective
/// configuration (`nginx -T`) and parsing the server blocks.
///
/// NixOS runs nginx with a custom `-c <path>` config, so the path is first
/// recovered from the systemd unit; when that fails, a plain `nginx -T`
/// (default config) is used as a fallback. Non-root invocations go through
/// sudo. Returns `None` on any command failure or when no sites are found.
async fn get_nginx_sites(&self) -> Option<Vec<String>> {

    // Get the actual nginx config file path from systemd (NixOS uses custom config)
    let config_path = match self.get_nginx_config_from_systemd().await {
        Some(path) => path,
        None => {
            // Fallback to default nginx -T
            // Build the command, prefixing with sudo unless already root.
            let mut cmd = if self.is_running_as_root() {
                Command::new("/run/current-system/sw/bin/nginx")
            } else {
                let mut cmd = Command::new("sudo");
                cmd.arg("/run/current-system/sw/bin/nginx");
                cmd
            };

            // This arm returns directly: parse the default-config dump
            // and never fall through to the explicit-config path below.
            match cmd
                .args(["-T"])
                .stdout(Stdio::piped())
                .stderr(Stdio::piped())
                .output()
                .await
            {
                Ok(output) => {
                    if !output.status.success() {
                        return None;
                    }
                    let config = String::from_utf8_lossy(&output.stdout);
                    return self.parse_nginx_config(&config).await;
                }
                Err(_) => {
                    return None;
                }
            }
        }
    };

    // Use the specific config file
    // Same sudo-or-direct command construction as the fallback above.
    let mut cmd = if self.is_running_as_root() {
        Command::new("/run/current-system/sw/bin/nginx")
    } else {
        let mut cmd = Command::new("sudo");
        cmd.arg("/run/current-system/sw/bin/nginx");
        cmd
    };

    let output = match cmd
        .args(["-T", "-c", &config_path])
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .output()
        .await
    {
        Ok(output) => output,
        Err(_) => {
            return None;
        }
    };

    if !output.status.success() {
        return None;
    }

    let config = String::from_utf8_lossy(&output.stdout);

    self.parse_nginx_config(&config).await
}
|
|
||||||
|
|
||||||
async fn get_nginx_config_from_systemd(&self) -> Option<String> {
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/systemctl")
|
|
||||||
.args(["show", "nginx", "--property=ExecStart", "--no-pager"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
// Parse ExecStart to extract -c config path
|
|
||||||
for line in stdout.lines() {
|
|
||||||
if line.starts_with("ExecStart=") {
|
|
||||||
// Handle both traditional and NixOS systemd formats
|
|
||||||
// Traditional: ExecStart=/path/nginx -c /config
|
|
||||||
// NixOS: ExecStart={ path=...; argv[]=...nginx -c /config; ... }
|
|
||||||
|
|
||||||
if let Some(c_index) = line.find(" -c ") {
|
|
||||||
let after_c = &line[c_index + 4..];
|
|
||||||
// Find the end of the config path
|
|
||||||
let end_pos = after_c.find(' ')
|
|
||||||
.or_else(|| after_c.find(" ;")) // NixOS format ends with " ;"
|
|
||||||
.unwrap_or(after_c.len());
|
|
||||||
|
|
||||||
let config_path = after_c[..end_pos].trim();
|
|
||||||
return Some(config_path.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn parse_nginx_config(&self, config: &str) -> Option<Vec<String>> {
|
|
||||||
let mut sites = Vec::new();
|
|
||||||
let lines: Vec<&str> = config.lines().collect();
|
|
||||||
let mut i = 0;
|
|
||||||
|
|
||||||
while i < lines.len() {
|
|
||||||
let trimmed = lines[i].trim();
|
|
||||||
|
|
||||||
// Look for server blocks
|
|
||||||
if trimmed == "server {" {
|
|
||||||
if let Some(hostname) = self.parse_server_block(&lines, &mut i) {
|
|
||||||
sites.push(hostname);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
i += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Return all sites from nginx config (monitor all, regardless of current status)
|
|
||||||
if sites.is_empty() {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some(sites)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_server_block(&self, lines: &[&str], start_index: &mut usize) -> Option<String> {
|
|
||||||
let mut server_names = Vec::new();
|
|
||||||
let mut has_redirect = false;
|
|
||||||
let mut i = *start_index + 1;
|
|
||||||
let mut brace_count = 1;
|
|
||||||
|
|
||||||
// Parse until we close the server block
|
|
||||||
while i < lines.len() && brace_count > 0 {
|
|
||||||
let trimmed = lines[i].trim();
|
|
||||||
|
|
||||||
// Track braces
|
|
||||||
brace_count += trimmed.matches('{').count();
|
|
||||||
brace_count -= trimmed.matches('}').count();
|
|
||||||
|
|
||||||
// Extract server_name
|
|
||||||
if trimmed.starts_with("server_name") {
|
|
||||||
if let Some(names_part) = trimmed.strip_prefix("server_name") {
|
|
||||||
let names_clean = names_part.trim().trim_end_matches(';');
|
|
||||||
for name in names_clean.split_whitespace() {
|
|
||||||
if name != "_" && !name.is_empty() && name.contains('.') && !name.starts_with('$') {
|
|
||||||
server_names.push(name.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if this server block is just a redirect
|
|
||||||
if trimmed.starts_with("return") && trimmed.contains("301") {
|
|
||||||
has_redirect = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
i += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
*start_index = i - 1;
|
|
||||||
|
|
||||||
// Only return hostnames that are not redirects and have actual content
|
|
||||||
if !server_names.is_empty() && !has_redirect {
|
|
||||||
Some(server_names[0].clone())
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
async fn get_nginx_description(&self) -> Option<String> {
|
|
||||||
// Get site count and active connections
|
|
||||||
let sites = self.get_nginx_sites().await?;
|
|
||||||
let site_count = sites.len();
|
|
||||||
|
|
||||||
// Get active connections
|
|
||||||
let connections = self.get_web_server_connections().await;
|
|
||||||
|
|
||||||
if let Some(conn_info) = connections {
|
|
||||||
Some(format!("{} sites, {}", site_count, conn_info))
|
|
||||||
} else {
|
|
||||||
Some(format!("{} sites", site_count))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_redis_info(&self) -> Option<String> {
|
|
||||||
// Try redis-cli first
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/redis-cli")
|
|
||||||
.args(["info", "clients"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
for line in stdout.lines() {
|
|
||||||
if line.starts_with("connected_clients:") {
|
|
||||||
if let Some(count) = line.split(':').nth(1) {
|
|
||||||
if let Ok(client_count) = count.trim().parse::<i32>() {
|
|
||||||
return Some(format!("{} connections", client_count));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fallback: check for redis connections on port 6379
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/ss")
|
|
||||||
.args(["-tn", "state", "established", "dport", "= :6379"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let connection_count = stdout.lines().count().saturating_sub(1);
|
|
||||||
if connection_count > 0 {
|
|
||||||
return Some(format!("{} connections", connection_count));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
async fn get_immich_info(&self) -> Option<String> {
|
|
||||||
// Check HTTP connections - Immich runs on port 8084 (from nginx proxy config)
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/ss")
|
|
||||||
.args(["-tn", "state", "established", "dport", "= :8084"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let connection_count = stdout.lines().count().saturating_sub(1);
|
|
||||||
if connection_count > 0 {
|
|
||||||
return Some(format!("{} connections", connection_count));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_vaultwarden_info(&self) -> Option<String> {
|
|
||||||
// Check vaultwarden connections on port 8222 (from nginx proxy config)
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/ss")
|
|
||||||
.args(["-tn", "state", "established", "dport", "= :8222"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let connection_count = stdout.lines().count().saturating_sub(1);
|
|
||||||
if connection_count > 0 {
|
|
||||||
return Some(format!("{} connections", connection_count));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_unifi_info(&self) -> Option<String> {
|
|
||||||
// Check UniFi connections on port 8080 (TCP)
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/ss")
|
|
||||||
.args(["-tn", "state", "established", "dport", "= :8080"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let connection_count = stdout.lines().count().saturating_sub(1);
|
|
||||||
if connection_count > 0 {
|
|
||||||
return Some(format!("{} connections", connection_count));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_mosquitto_info(&self) -> Option<String> {
|
|
||||||
// Check for active connections using netstat on MQTT ports
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/ss")
|
|
||||||
.args(["-tn", "state", "established", "sport", "= :1883", "or", "sport", "= :8883"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let connection_count = stdout.lines().count().saturating_sub(1);
|
|
||||||
if connection_count > 0 {
|
|
||||||
return Some(format!("{} connections", connection_count));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_docker_registry_info(&self) -> Option<String> {
|
|
||||||
// Check Docker registry connections on port 5000 (from nginx proxy config)
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/ss")
|
|
||||||
.args(["-tn", "state", "established", "dport", "= :5000"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let connection_count = stdout.lines().count().saturating_sub(1);
|
|
||||||
if connection_count > 0 {
|
|
||||||
return Some(format!("{} connections", connection_count));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_haasp_webgrid_info(&self) -> Option<String> {
|
|
||||||
// Check HAASP webgrid connections on port 8081
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/ss")
|
|
||||||
.args(["-tn", "state", "established", "dport", "= :8081"])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let connection_count = stdout.lines().count().saturating_sub(1);
|
|
||||||
if connection_count > 0 {
|
|
||||||
return Some(format!("{} connections", connection_count));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait]
/// Collector implementation: gathers per-service health, memory, and disk
/// figures, expands nginx sites and docker containers into sub-services,
/// and emits everything as a single JSON payload.
impl Collector for ServiceCollector {
    /// Stable identifier for this collector.
    fn name(&self) -> &str {
        "service"
    }

    /// This collector reports under the Service agent type.
    fn agent_type(&self) -> AgentType {
        AgentType::Service
    }

    /// Polling interval, taken from the collector's configuration.
    fn collect_interval(&self) -> Duration {
        self.interval
    }

    /// Collects status for every configured service and returns a JSON
    /// document with a "summary" section (health counters, memory, disk,
    /// GPU) and a "services" array (including nginx-site and
    /// docker-container sub-services).
    async fn collect(&self) -> Result<CollectorOutput, CollectorError> {
        let mut services = Vec::new();
        // Health counters feed determine_services_status below.
        let mut healthy = 0;
        let mut degraded = 0;
        let mut failed = 0;
        let mut total_memory_used = 0.0;
        let mut total_memory_quota = 0.0;
        let mut total_disk_used = 0.0;

        // Collect data from all configured services
        for service in &self.services {
            match self.get_service_status(service).await {
                Ok(service_data) => {
                    // Tally the service into exactly one health bucket.
                    match service_data.status {
                        ServiceStatus::Running => healthy += 1,
                        ServiceStatus::Degraded | ServiceStatus::Restarting => degraded += 1,
                        ServiceStatus::Stopped => failed += 1,
                    }

                    // Aggregate resource figures; a quota of 0.0 means
                    // "no systemd limit" and is excluded from the total.
                    total_memory_used += service_data.memory_used_mb;
                    if service_data.memory_quota_mb > 0.0 {
                        total_memory_quota += service_data.memory_quota_mb;
                    }
                    total_disk_used += service_data.disk_used_gb;

                    // Handle nginx specially - create sub-services for sites
                    if service == "nginx" && matches!(service_data.status, ServiceStatus::Running) {
                        // Clear nginx description - sites will become individual sub-services
                        let mut nginx_service = service_data;
                        nginx_service.description = None;
                        services.push(nginx_service);

                        // Add nginx sites as individual sub-services
                        if let Some(sites) = self.get_nginx_sites().await {
                            for site in sites.iter() {
                                // Measure latency and health for this site
                                let (latency, is_healthy) = self.measure_site_latency(site).await;

                                // Determine status and description based on latency and health
                                let (site_status, site_description) = match (latency, is_healthy) {
                                    (Some(_ms), true) => (ServiceStatus::Running, None),
                                    (Some(_ms), false) => (ServiceStatus::Stopped, None), // Show error status but no description
                                    (None, _) => (ServiceStatus::Stopped, Some(vec!["unreachable".to_string()])),
                                };

                                // Update counters based on site status
                                match site_status {
                                    ServiceStatus::Running => healthy += 1,
                                    ServiceStatus::Stopped => failed += 1,
                                    _ => degraded += 1,
                                }

                                // Sites carry no resource figures of their
                                // own — only status and measured latency.
                                services.push(ServiceData {
                                    name: site.clone(),
                                    status: site_status,
                                    memory_used_mb: 0.0,
                                    memory_quota_mb: 0.0,
                                    cpu_percent: 0.0,
                                    sandbox_limit: None,
                                    disk_used_gb: 0.0,
                                    disk_quota_gb: 0.0,
                                    is_sandboxed: false, // Sub-services inherit parent sandbox status
                                    is_sandbox_excluded: false,
                                    description: site_description,
                                    sub_service: Some("nginx".to_string()),
                                    latency_ms: latency,
                                });
                            }
                        }
                    }
                    // Handle docker specially - create sub-services for containers
                    else if service == "docker" && matches!(service_data.status, ServiceStatus::Running) {
                        // Clear docker description - containers will become individual sub-services
                        let mut docker_service = service_data;
                        docker_service.description = None;
                        services.push(docker_service);

                        // Add docker containers as individual sub-services
                        if let Some(containers) = self.get_docker_containers().await {
                            for container in containers.iter() {
                                services.push(ServiceData {
                                    name: container.clone(),
                                    status: ServiceStatus::Running, // Assume containers are running if docker is running
                                    memory_used_mb: 0.0,
                                    memory_quota_mb: 0.0,
                                    cpu_percent: 0.0,
                                    sandbox_limit: None,
                                    disk_used_gb: 0.0,
                                    disk_quota_gb: 0.0,
                                    is_sandboxed: true, // Docker containers are inherently sandboxed
                                    is_sandbox_excluded: false,
                                    description: None,
                                    sub_service: Some("docker".to_string()),
                                    latency_ms: None,
                                });
                                healthy += 1;
                            }
                        }
                    } else {
                        // Ordinary service: push it through unchanged.
                        services.push(service_data);
                    }
                }
                Err(e) => {
                    failed += 1;
                    // Add a placeholder service entry for failed collection
                    services.push(ServiceData {
                        name: service.clone(),
                        status: ServiceStatus::Stopped,
                        memory_used_mb: 0.0,
                        memory_quota_mb: 0.0,
                        cpu_percent: 0.0,
                        sandbox_limit: None,
                        disk_used_gb: 0.0,
                        disk_quota_gb: 0.0,
                        is_sandboxed: false, // Unknown for failed services
                        is_sandbox_excluded: false,
                        description: None,
                        sub_service: None,
                        latency_ms: None,
                    });
                    tracing::warn!("Failed to collect metrics for service {}: {}", service, e);
                }
            }
        }

        // NOTE(review): disk_usage is computed but never referenced below —
        // the summary uses total_disk_used for both fields. Either wire it
        // into the payload or drop the call; confirm intent.
        let disk_usage = self.get_disk_usage().await.unwrap_or(DiskUsage {
            total_capacity_gb: 0.0,
            used_gb: 0.0,
        });

        // Memory quotas remain as detected from systemd - don't default to system total
        // Services without memory limits will show quota = 0.0 and display usage only

        // Calculate overall services status
        let services_status = self.determine_services_status(healthy, degraded, failed);

        let (gpu_load_percent, gpu_temp_c) = self.get_gpu_metrics().await;

        // If no specific quotas are set, use a default value
        if total_memory_quota == 0.0 {
            total_memory_quota = 8192.0; // Default 8GB for quota calculation
        }

        let service_metrics = json!({
            "summary": {
                "healthy": healthy,
                "degraded": degraded,
                "failed": failed,
                "services_status": services_status,
                "memory_used_mb": total_memory_used,
                "memory_quota_mb": total_memory_quota,
                "disk_used_gb": total_disk_used,
                "disk_total_gb": total_disk_used, // For services, total = used (no quota concept)
                "gpu_load_percent": gpu_load_percent,
                "gpu_temp_c": gpu_temp_c,
            },
            "services": services,
            "timestamp": Utc::now()
        });

        Ok(CollectorOutput {
            agent_type: AgentType::Service,
            data: service_metrics,
        })
    }
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize)]
|
|
||||||
struct ServiceData {
|
|
||||||
name: String,
|
|
||||||
status: ServiceStatus,
|
|
||||||
memory_used_mb: f32,
|
|
||||||
memory_quota_mb: f32,
|
|
||||||
cpu_percent: f32,
|
|
||||||
sandbox_limit: Option<f32>,
|
|
||||||
disk_used_gb: f32,
|
|
||||||
disk_quota_gb: f32,
|
|
||||||
is_sandboxed: bool,
|
|
||||||
is_sandbox_excluded: bool,
|
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
|
||||||
description: Option<Vec<String>>,
|
|
||||||
#[serde(default)]
|
|
||||||
sub_service: Option<String>,
|
|
||||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
|
||||||
latency_ms: Option<f32>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Coarse lifecycle state reported for each monitored service.
#[derive(Debug, Clone, Serialize)]
enum ServiceStatus {
    /// Service is active and healthy.
    Running,
    /// Service is up but in a degraded state.
    Degraded,
    /// Service is currently restarting.
    Restarting,
    /// Service is not running (also used as a placeholder when metric
    /// collection for the service fails).
    Stopped,
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Root-filesystem capacity snapshot used by the service collector's
/// summary; kept even when some fields are unread.
#[allow(dead_code)]
struct DiskUsage {
    /// Total filesystem capacity in GB.
    total_capacity_gb: f32,
    /// Space currently in use in GB.
    used_gb: f32,
}
|
|
||||||
@@ -1,483 +0,0 @@
|
|||||||
use async_trait::async_trait;
|
|
||||||
use chrono::Utc;
|
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
use serde_json::json;
|
|
||||||
use std::io::ErrorKind;
|
|
||||||
use std::process::Stdio;
|
|
||||||
use std::time::Duration;
|
|
||||||
use tokio::process::Command;
|
|
||||||
use tokio::time::timeout;
|
|
||||||
|
|
||||||
use super::{AgentType, Collector, CollectorError, CollectorOutput};
|
|
||||||
|
|
||||||
/// Collector that reads SMART health data for the configured block devices
/// via `smartctl` and augments it with capacity/usage information.
#[derive(Debug, Clone)]
pub struct SmartCollector {
    /// How often `collect` should be scheduled.
    pub interval: Duration,
    /// Kernel device names to query (e.g. "nvme0n1", "sda").
    pub devices: Vec<String>,
    /// Per-invocation timeout for the `smartctl` subprocess, in milliseconds.
    pub timeout_ms: u64,
}
|
|
||||||
|
|
||||||
impl SmartCollector {
|
|
||||||
    /// Creates a collector for the given devices with the given interval.
    ///
    /// `_enabled` is accepted for signature parity with the other collector
    /// constructors but is currently unused here.
    pub fn new(_enabled: bool, interval_ms: u64, devices: Vec<String>) -> Self {
        Self {
            interval: Duration::from_millis(interval_ms),
            devices,
            timeout_ms: 30000, // 30 second timeout for smartctl
        }
    }
|
|
||||||
|
|
||||||
async fn is_device_mounted(&self, device: &str) -> bool {
|
|
||||||
// Check if device is mounted by looking in /proc/mounts
|
|
||||||
if let Ok(mounts) = tokio::fs::read_to_string("/proc/mounts").await {
|
|
||||||
for line in mounts.lines() {
|
|
||||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
|
||||||
if parts.len() >= 2 {
|
|
||||||
// Check if this mount point references our device
|
|
||||||
// Handle both /dev/nvme0n1p1 style and /dev/sda1 style
|
|
||||||
if parts[0].starts_with(&format!("/dev/{}", device)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
false
|
|
||||||
}
|
|
||||||
|
|
||||||
    /// Runs `smartctl -a -j` (via sudo) for a device and parses the JSON
    /// output into a `SmartDeviceData` snapshot.
    ///
    /// Failures are classified into distinct `CollectorError` variants:
    /// timeout, missing `smartctl` binary, permission problems, unknown
    /// device, or a generic command/parse failure.
    async fn get_smart_data(&self, device: &str) -> Result<SmartDeviceData, CollectorError> {
        let timeout_duration = Duration::from_millis(self.timeout_ms);

        // sudo is needed for raw device access; the absolute path pins the
        // NixOS system-profile smartctl binary.
        let command_result = timeout(
            timeout_duration,
            Command::new("sudo")
                .args(["/run/current-system/sw/bin/smartctl", "-a", "-j", &format!("/dev/{}", device)])
                .stdout(Stdio::piped())
                .stderr(Stdio::piped())
                .output(),
        )
        .await
        .map_err(|_| CollectorError::Timeout {
            duration_ms: self.timeout_ms,
        })?;

        // Spawn-level failures: classify by the underlying io::Error kind.
        let output = command_result.map_err(|e| match e.kind() {
            ErrorKind::NotFound => CollectorError::ExternalDependency {
                dependency: "smartctl".to_string(),
                message: e.to_string(),
            },
            ErrorKind::PermissionDenied => CollectorError::PermissionDenied {
                message: e.to_string(),
            },
            _ => CollectorError::CommandFailed {
                command: format!("smartctl -a -j /dev/{}", device),
                message: e.to_string(),
            },
        })?;

        // Non-zero exit: sniff stderr for a more specific error category
        // before falling back to a generic command failure.
        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            let stderr_lower = stderr.to_lowercase();

            if stderr_lower.contains("permission denied") {
                return Err(CollectorError::PermissionDenied {
                    message: stderr.to_string(),
                });
            }

            if stderr_lower.contains("no such device") || stderr_lower.contains("cannot open") {
                return Err(CollectorError::DeviceNotFound {
                    device: device.to_string(),
                });
            }

            return Err(CollectorError::CommandFailed {
                command: format!("smartctl -a -j /dev/{}", device),
                message: stderr.to_string(),
            });
        }

        // Deserialize only the subset of fields modelled by SmartCtlOutput.
        let stdout = String::from_utf8_lossy(&output.stdout);
        let smart_output: SmartCtlOutput =
            serde_json::from_str(&stdout).map_err(|e| CollectorError::ParseError {
                message: format!("Failed to parse smartctl output for {}: {}", device, e),
            })?;

        Ok(SmartDeviceData::from_smartctl_output(device, smart_output))
    }
|
|
||||||
|
|
||||||
async fn get_drive_usage(
|
|
||||||
&self,
|
|
||||||
device: &str,
|
|
||||||
) -> Result<(Option<f32>, Option<f32>), CollectorError> {
|
|
||||||
// Get capacity first
|
|
||||||
let capacity = match self.get_drive_capacity(device).await {
|
|
||||||
Ok(cap) => Some(cap),
|
|
||||||
Err(_) => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
// Try to get usage information
|
|
||||||
// For simplicity, we'll use the root filesystem usage for now
|
|
||||||
// In the future, this could be enhanced to map drives to specific mount points
|
|
||||||
let usage = if device.contains("nvme0n1") || device.contains("sda") {
|
|
||||||
// This is likely the main system drive, use root filesystem usage
|
|
||||||
match self.get_disk_usage().await {
|
|
||||||
Ok(disk_usage) => Some(disk_usage.used_gb),
|
|
||||||
Err(_) => None,
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// For other drives, we don't have usage info yet
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok((capacity, usage))
|
|
||||||
}
|
|
||||||
|
|
||||||
    /// Queries a device's total size via `lsblk -J -o NAME,SIZE` and
    /// converts the human-readable size string into GB.
    async fn get_drive_capacity(&self, device: &str) -> Result<f32, CollectorError> {
        // Absolute path pins the NixOS system-profile lsblk binary.
        let output = Command::new("/run/current-system/sw/bin/lsblk")
            .args(["-J", "-o", "NAME,SIZE", &format!("/dev/{}", device)])
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .output()
            .await
            .map_err(|e| CollectorError::CommandFailed {
                command: format!("lsblk -J -o NAME,SIZE /dev/{}", device),
                message: e.to_string(),
            })?;

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            return Err(CollectorError::CommandFailed {
                command: format!("lsblk -J -o NAME,SIZE /dev/{}", device),
                message: stderr.to_string(),
            });
        }

        // Parsed as a generic Value because we only need one nested field.
        let stdout = String::from_utf8_lossy(&output.stdout);
        let lsblk_output: serde_json::Value =
            serde_json::from_str(&stdout).map_err(|e| CollectorError::ParseError {
                message: format!("Failed to parse lsblk JSON: {}", e),
            })?;

        // Extract size from the first blockdevice
        if let Some(blockdevices) = lsblk_output["blockdevices"].as_array() {
            if let Some(device_info) = blockdevices.first() {
                if let Some(size_str) = device_info["size"].as_str() {
                    return self.parse_lsblk_size(size_str);
                }
            }
        }

        Err(CollectorError::ParseError {
            message: format!("No size information found for device {}", device),
        })
    }
|
|
||||||
|
|
||||||
fn parse_lsblk_size(&self, size_str: &str) -> Result<f32, CollectorError> {
|
|
||||||
// Parse sizes like "953,9G", "1T", "512M"
|
|
||||||
let size_str = size_str.replace(',', "."); // Handle European decimal separator
|
|
||||||
|
|
||||||
if let Some(pos) = size_str.find(|c: char| c.is_alphabetic()) {
|
|
||||||
let (number_part, unit_part) = size_str.split_at(pos);
|
|
||||||
let number: f32 = number_part
|
|
||||||
.parse()
|
|
||||||
.map_err(|e| CollectorError::ParseError {
|
|
||||||
message: format!("Failed to parse size number '{}': {}", number_part, e),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let multiplier = match unit_part.to_uppercase().as_str() {
|
|
||||||
"T" | "TB" => 1024.0,
|
|
||||||
"G" | "GB" => 1.0,
|
|
||||||
"M" | "MB" => 1.0 / 1024.0,
|
|
||||||
"K" | "KB" => 1.0 / (1024.0 * 1024.0),
|
|
||||||
_ => {
|
|
||||||
return Err(CollectorError::ParseError {
|
|
||||||
message: format!("Unknown size unit: {}", unit_part),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(number * multiplier)
|
|
||||||
} else {
|
|
||||||
Err(CollectorError::ParseError {
|
|
||||||
message: format!("Invalid size format: {}", size_str),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
    /// Reads root-filesystem totals via `df` and returns them in GB.
    ///
    /// `-BG` forces gigabyte units, so each column can be parsed by simply
    /// stripping the trailing 'G'.
    async fn get_disk_usage(&self) -> Result<DiskUsage, CollectorError> {
        let output = Command::new("/run/current-system/sw/bin/df")
            .args(["-BG", "--output=size,used,avail", "/"])
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .output()
            .await
            .map_err(|e| CollectorError::CommandFailed {
                command: "df -BG --output=size,used,avail /".to_string(),
                message: e.to_string(),
            })?;

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            return Err(CollectorError::CommandFailed {
                command: "df -BG --output=size,used,avail /".to_string(),
                message: stderr.to_string(),
            });
        }

        let stdout = String::from_utf8_lossy(&output.stdout);
        let lines: Vec<&str> = stdout.lines().collect();

        // Expect a header line plus exactly one data line for "/".
        if lines.len() < 2 {
            return Err(CollectorError::ParseError {
                message: "Unexpected df output format".to_string(),
            });
        }

        // Skip header line, parse data line
        let data_line = lines[1].trim();
        let parts: Vec<&str> = data_line.split_whitespace().collect();

        if parts.len() < 3 {
            return Err(CollectorError::ParseError {
                message: format!("Unexpected df data format: {}", data_line),
            });
        }

        // Each df column looks like "123G"; strip the unit and parse.
        let parse_size = |s: &str| -> Result<f32, CollectorError> {
            s.trim_end_matches('G')
                .parse::<f32>()
                .map_err(|e| CollectorError::ParseError {
                    message: format!("Failed to parse disk size '{}': {}", s, e),
                })
        };

        Ok(DiskUsage {
            total_gb: parse_size(parts[0])?,
            used_gb: parse_size(parts[1])?,
            available_gb: parse_size(parts[2])?,
        })
    }
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait]
impl Collector for SmartCollector {
    /// Collector identifier used for registration/logging.
    fn name(&self) -> &str {
        "smart"
    }

    fn agent_type(&self) -> AgentType {
        AgentType::Smart
    }

    /// Scheduling interval configured at construction time.
    fn collect_interval(&self) -> Duration {
        self.interval
    }


    /// Gathers SMART data for every configured, mounted device and emits a
    /// JSON payload with per-drive details, a health summary, and issues.
    ///
    /// Per-device failures are downgraded to warnings (recorded in
    /// `issues`), so one bad drive does not abort the collection; only a
    /// failure to read overall disk usage returns an error.
    async fn collect(&self) -> Result<CollectorOutput, CollectorError> {
        let mut drives = Vec::new();
        let mut issues = Vec::new();
        let mut healthy = 0;
        let mut warning = 0;
        let mut critical = 0;

        // Collect data from all configured devices
        for device in &self.devices {
            // Skip unmounted devices
            if !self.is_device_mounted(device).await {
                continue;
            }

            match self.get_smart_data(device).await {
                Ok(mut drive_data) => {
                    // Try to get capacity and usage for this drive;
                    // failure here just leaves the fields as None.
                    if let Ok((capacity, usage)) = self.get_drive_usage(device).await {
                        drive_data.capacity_gb = capacity;
                        drive_data.used_gb = usage;
                    }
                    // Tally health: anything other than PASSED/FAILED
                    // counts as a warning.
                    match drive_data.health_status.as_str() {
                        "PASSED" => healthy += 1,
                        "FAILED" => {
                            critical += 1;
                            issues.push(format!("{}: SMART status FAILED", device));
                        }
                        _ => {
                            warning += 1;
                            issues.push(format!("{}: Unknown SMART status", device));
                        }
                    }
                    drives.push(drive_data);
                }
                Err(e) => {
                    warning += 1;
                    issues.push(format!("{}: {}", device, e));
                }
            }
        }

        // Get disk usage information
        let disk_usage = self.get_disk_usage().await?;

        // Overall status is the worst individual result.
        let status = if critical > 0 {
            "critical"
        } else if warning > 0 {
            "warning"
        } else {
            "ok"
        };

        let smart_metrics = json!({
            "status": status,
            "drives": drives,
            "summary": {
                "healthy": healthy,
                "warning": warning,
                "critical": critical,
                "capacity_total_gb": disk_usage.total_gb,
                "capacity_used_gb": disk_usage.used_gb,
                "capacity_available_gb": disk_usage.available_gb
            },
            "issues": issues,
            "timestamp": Utc::now()
        });

        Ok(CollectorOutput {
            agent_type: AgentType::Smart,
            data: smart_metrics,
        })
    }
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize)]
|
|
||||||
struct SmartDeviceData {
|
|
||||||
name: String,
|
|
||||||
temperature_c: f32,
|
|
||||||
wear_level: f32,
|
|
||||||
power_on_hours: u64,
|
|
||||||
available_spare: f32,
|
|
||||||
health_status: String,
|
|
||||||
capacity_gb: Option<f32>,
|
|
||||||
used_gb: Option<f32>,
|
|
||||||
#[serde(default)]
|
|
||||||
description: Option<Vec<String>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SmartDeviceData {
    /// Maps raw smartctl JSON into a flat snapshot, substituting
    /// conservative defaults for any missing sections.
    fn from_smartctl_output(device: &str, output: SmartCtlOutput) -> Self {
        // 0.0 signals "temperature unknown".
        let temperature_c = output.temperature.and_then(|t| t.current).unwrap_or(0.0);

        // NVMe "percentage used" doubles as the wear-level estimate.
        let wear_level = output
            .nvme_smart_health_information_log
            .as_ref()
            .and_then(|nvme| nvme.percentage_used)
            .unwrap_or(0.0);

        let power_on_hours = output.power_on_time.and_then(|p| p.hours).unwrap_or(0);

        // Default to a fully healthy 100% spare when the log is absent.
        let available_spare = output
            .nvme_smart_health_information_log
            .as_ref()
            .and_then(|nvme| nvme.available_spare)
            .unwrap_or(100.0);

        // Tri-state health: PASSED / FAILED, or UNKNOWN when smartctl
        // reported no overall status.
        let health_status = output
            .smart_status
            .and_then(|s| s.passed)
            .map(|passed| {
                if passed {
                    "PASSED".to_string()
                } else {
                    "FAILED".to_string()
                }
            })
            .unwrap_or_else(|| "UNKNOWN".to_string());

        // Build SMART description with key metrics
        let mut smart_details = Vec::new();
        if available_spare > 0.0 {
            smart_details.push(format!("Spare: {}%", available_spare as u32));
        }
        if power_on_hours > 0 {
            smart_details.push(format!("Hours: {}", power_on_hours));
        }

        // One combined line, or no description at all when nothing useful
        // was reported.
        let description = if smart_details.is_empty() {
            None
        } else {
            Some(vec![smart_details.join(", ")])
        };

        Self {
            name: device.to_string(),
            temperature_c,
            wear_level,
            power_on_hours,
            available_spare,
            health_status,
            capacity_gb: None, // Will be set later by the collector
            used_gb: None, // Will be set later by the collector
            description,
        }
    }
}
|
|
||||||
|
|
||||||
/// Root-filesystem usage figures (GB) parsed from `df` output.
#[derive(Debug, Clone)]
struct DiskUsage {
    /// Filesystem size.
    total_gb: f32,
    /// Space currently in use.
    used_gb: f32,
    /// Space still available.
    available_gb: f32,
}
|
|
||||||
|
|
||||||
// Minimal smartctl JSON output structure - only the fields we need
|
|
||||||
#[derive(Debug, Deserialize)]
|
|
||||||
struct SmartCtlOutput {
|
|
||||||
temperature: Option<Temperature>,
|
|
||||||
power_on_time: Option<PowerOnTime>,
|
|
||||||
smart_status: Option<SmartStatus>,
|
|
||||||
nvme_smart_health_information_log: Option<NvmeSmartLog>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `temperature` section of smartctl JSON output.
#[derive(Debug, Deserialize)]
struct Temperature {
    /// Current drive temperature (treated as °C by the caller).
    current: Option<f32>,
}
|
|
||||||
|
|
||||||
/// `power_on_time` section of smartctl JSON output.
#[derive(Debug, Deserialize)]
struct PowerOnTime {
    /// Cumulative powered-on hours.
    hours: Option<u64>,
}
|
|
||||||
|
|
||||||
/// `smart_status` section of smartctl JSON output.
#[derive(Debug, Deserialize)]
struct SmartStatus {
    /// Whether the drive's overall SMART self-assessment passed.
    passed: Option<bool>,
}
|
|
||||||
|
|
||||||
/// `nvme_smart_health_information_log` section of smartctl JSON output.
#[derive(Debug, Deserialize)]
struct NvmeSmartLog {
    /// Wear estimate in percent of rated endurance.
    percentage_used: Option<f32>,
    /// Remaining spare capacity in percent.
    available_spare: Option<f32>,
}
|
|
||||||
|
|
||||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Unit conversions of `parse_lsblk_size`, including the European
    /// comma decimal separator and rejection of malformed inputs.
    #[test]
    fn test_parse_lsblk_size() {
        let collector = SmartCollector::new(true, 5000, vec![]);

        // Test gigabyte sizes
        assert!((collector.parse_lsblk_size("953,9G").unwrap() - 953.9).abs() < 0.1);
        assert!((collector.parse_lsblk_size("1G").unwrap() - 1.0).abs() < 0.1);

        // Test terabyte sizes
        assert!((collector.parse_lsblk_size("1T").unwrap() - 1024.0).abs() < 0.1);
        assert!((collector.parse_lsblk_size("2,5T").unwrap() - 2560.0).abs() < 0.1);

        // Test megabyte sizes
        assert!((collector.parse_lsblk_size("512M").unwrap() - 0.5).abs() < 0.1);

        // Test error cases
        assert!(collector.parse_lsblk_size("invalid").is_err());
        assert!(collector.parse_lsblk_size("1X").is_err());
    }
}
|
|
||||||
@@ -1,409 +0,0 @@
|
|||||||
use async_trait::async_trait;
|
|
||||||
use serde_json::json;
|
|
||||||
use std::time::Duration;
|
|
||||||
use tokio::fs;
|
|
||||||
use tokio::process::Command;
|
|
||||||
use tracing::debug;
|
|
||||||
|
|
||||||
use super::{Collector, CollectorError, CollectorOutput, AgentType};
|
|
||||||
|
|
||||||
/// Collector for host-level metrics: CPU load/temperature, memory usage,
/// C-state distribution, logged-in users, and top processes.
pub struct SystemCollector {
    /// When false, `collect` returns a config error instead of running.
    enabled: bool,
    /// Scheduling interval for this collector.
    interval: Duration,
}
|
|
||||||
|
|
||||||
impl SystemCollector {
|
|
||||||
    /// Creates a collector with the given enablement flag and interval
    /// (in milliseconds).
    pub fn new(enabled: bool, interval_ms: u64) -> Self {
        Self {
            enabled,
            interval: Duration::from_millis(interval_ms),
        }
    }
|
|
||||||
|
|
||||||
async fn get_cpu_load(&self) -> Result<(f32, f32, f32), CollectorError> {
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/uptime")
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.map_err(|e| CollectorError::CommandFailed {
|
|
||||||
command: "uptime".to_string(),
|
|
||||||
message: e.to_string()
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let uptime_str = String::from_utf8_lossy(&output.stdout);
|
|
||||||
|
|
||||||
// Parse load averages from uptime output
|
|
||||||
// Format with comma decimals: "... load average: 3,30, 3,17, 2,84"
|
|
||||||
if let Some(load_part) = uptime_str.split("load average:").nth(1) {
|
|
||||||
// Use regex or careful parsing for comma decimal separator locale
|
|
||||||
let load_str = load_part.trim();
|
|
||||||
// Split on ", " to separate the three load values
|
|
||||||
let loads: Vec<&str> = load_str.split(", ").collect();
|
|
||||||
if loads.len() >= 3 {
|
|
||||||
let load_1 = loads[0].trim().replace(',', ".").parse::<f32>()
|
|
||||||
.map_err(|_| CollectorError::ParseError { message: "Failed to parse 1min load".to_string() })?;
|
|
||||||
let load_5 = loads[1].trim().replace(',', ".").parse::<f32>()
|
|
||||||
.map_err(|_| CollectorError::ParseError { message: "Failed to parse 5min load".to_string() })?;
|
|
||||||
let load_15 = loads[2].trim().replace(',', ".").parse::<f32>()
|
|
||||||
.map_err(|_| CollectorError::ParseError { message: "Failed to parse 15min load".to_string() })?;
|
|
||||||
|
|
||||||
return Ok((load_1, load_5, load_15));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(CollectorError::ParseError { message: "Failed to parse load averages".to_string() })
|
|
||||||
}
|
|
||||||
|
|
||||||
    /// Best-effort CPU temperature in °C read from sysfs thermal zones.
    ///
    /// Two passes over `thermal_zone0..=9`: first prefer CPU package zones
    /// ("x86_pkg_temp" / "coretemp"); if none matches, fall back to the
    /// first zone with a plausible value. Readings outside 20–150 °C are
    /// discarded as sensor noise. Returns `None` when nothing usable is
    /// found.
    async fn get_cpu_temperature(&self) -> Option<f32> {
        // Try to find CPU-specific thermal zones first (x86_pkg_temp, coretemp, etc.)
        for i in 0..10 {
            let type_path = format!("/sys/class/thermal/thermal_zone{}/type", i);
            let temp_path = format!("/sys/class/thermal/thermal_zone{}/temp", i);

            if let (Ok(zone_type), Ok(temp_str)) = (
                fs::read_to_string(&type_path).await,
                fs::read_to_string(&temp_path).await,
            ) {
                let zone_type = zone_type.trim();
                // sysfs reports millidegrees Celsius.
                if let Ok(temp_millic) = temp_str.trim().parse::<f32>() {
                    let temp_c = temp_millic / 1000.0;
                    // Look for reasonable temperatures first
                    if temp_c > 20.0 && temp_c < 150.0 {
                        // Prefer CPU package temperature zones
                        if zone_type == "x86_pkg_temp" || zone_type.contains("coretemp") {
                            debug!("Found CPU temperature: {}°C from {} ({})", temp_c, temp_path, zone_type);
                            return Some(temp_c);
                        }
                    }
                }
            }
        }

        // Fallback: try any reasonable temperature if no CPU-specific zone found
        for i in 0..10 {
            let temp_path = format!("/sys/class/thermal/thermal_zone{}/temp", i);
            if let Ok(temp_str) = fs::read_to_string(&temp_path).await {
                if let Ok(temp_millic) = temp_str.trim().parse::<f32>() {
                    let temp_c = temp_millic / 1000.0;
                    if temp_c > 20.0 && temp_c < 150.0 {
                        debug!("Found fallback temperature: {}°C from {}", temp_c, temp_path);
                        return Some(temp_c);
                    }
                }
            }
        }
        None
    }
|
|
||||||
|
|
||||||
async fn get_memory_info(&self) -> Result<(f32, f32), CollectorError> {
|
|
||||||
let meminfo = fs::read_to_string("/proc/meminfo")
|
|
||||||
.await
|
|
||||||
.map_err(|e| CollectorError::IoError { message: format!("Failed to read /proc/meminfo: {}", e) })?;
|
|
||||||
|
|
||||||
let mut total_kb = 0;
|
|
||||||
let mut available_kb = 0;
|
|
||||||
|
|
||||||
for line in meminfo.lines() {
|
|
||||||
if line.starts_with("MemTotal:") {
|
|
||||||
if let Some(value) = line.split_whitespace().nth(1) {
|
|
||||||
total_kb = value.parse::<u64>().unwrap_or(0);
|
|
||||||
}
|
|
||||||
} else if line.starts_with("MemAvailable:") {
|
|
||||||
if let Some(value) = line.split_whitespace().nth(1) {
|
|
||||||
available_kb = value.parse::<u64>().unwrap_or(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if total_kb == 0 {
|
|
||||||
return Err(CollectorError::ParseError { message: "Could not parse total memory".to_string() });
|
|
||||||
}
|
|
||||||
|
|
||||||
let total_mb = total_kb as f32 / 1024.0;
|
|
||||||
let used_mb = total_mb - (available_kb as f32 / 1024.0);
|
|
||||||
|
|
||||||
Ok((used_mb, total_mb))
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_logged_in_users(&self) -> Option<Vec<String>> {
|
|
||||||
// Get currently logged-in users using 'who' command
|
|
||||||
let output = Command::new("who")
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
let who_output = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let mut users = Vec::new();
|
|
||||||
|
|
||||||
for line in who_output.lines() {
|
|
||||||
if let Some(username) = line.split_whitespace().next() {
|
|
||||||
if !username.is_empty() && !users.contains(&username.to_string()) {
|
|
||||||
users.push(username.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if users.is_empty() {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
users.sort();
|
|
||||||
Some(users)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_cpu_cstate_info(&self) -> Option<Vec<String>> {
|
|
||||||
// Read C-state information to show all sleep state distributions
|
|
||||||
let mut cstate_times: Vec<(String, u64)> = Vec::new();
|
|
||||||
let mut total_time = 0u64;
|
|
||||||
|
|
||||||
// Check if C-state information is available
|
|
||||||
if let Ok(mut entries) = fs::read_dir("/sys/devices/system/cpu/cpu0/cpuidle").await {
|
|
||||||
while let Ok(Some(entry)) = entries.next_entry().await {
|
|
||||||
let state_path = entry.path();
|
|
||||||
let name_path = state_path.join("name");
|
|
||||||
let time_path = state_path.join("time");
|
|
||||||
|
|
||||||
if let (Ok(name), Ok(time_str)) = (
|
|
||||||
fs::read_to_string(&name_path).await,
|
|
||||||
fs::read_to_string(&time_path).await
|
|
||||||
) {
|
|
||||||
let name = name.trim().to_string();
|
|
||||||
if let Ok(time) = time_str.trim().parse::<u64>() {
|
|
||||||
total_time += time;
|
|
||||||
cstate_times.push((name, time));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if total_time > 0 && !cstate_times.is_empty() {
|
|
||||||
// Sort by C-state order: POLL, C1, C1E, C3, C6, C7s, C8, C9, C10
|
|
||||||
cstate_times.sort_by(|a, b| {
|
|
||||||
let order_a = match a.0.as_str() {
|
|
||||||
"POLL" => 0,
|
|
||||||
"C1" => 1,
|
|
||||||
"C1E" => 2,
|
|
||||||
"C3" => 3,
|
|
||||||
"C6" => 4,
|
|
||||||
"C7s" => 5,
|
|
||||||
"C8" => 6,
|
|
||||||
"C9" => 7,
|
|
||||||
"C10" => 8,
|
|
||||||
_ => 99,
|
|
||||||
};
|
|
||||||
let order_b = match b.0.as_str() {
|
|
||||||
"POLL" => 0,
|
|
||||||
"C1" => 1,
|
|
||||||
"C1E" => 2,
|
|
||||||
"C3" => 3,
|
|
||||||
"C6" => 4,
|
|
||||||
"C7s" => 5,
|
|
||||||
"C8" => 6,
|
|
||||||
"C9" => 7,
|
|
||||||
"C10" => 8,
|
|
||||||
_ => 99,
|
|
||||||
};
|
|
||||||
order_a.cmp(&order_b)
|
|
||||||
});
|
|
||||||
|
|
||||||
// Format C-states as description lines (2 C-states per row)
|
|
||||||
let mut result = Vec::new();
|
|
||||||
let mut current_line = Vec::new();
|
|
||||||
|
|
||||||
for (name, time) in cstate_times {
|
|
||||||
let percent = (time as f32 / total_time as f32) * 100.0;
|
|
||||||
if percent >= 0.1 { // Only show states with at least 0.1% time
|
|
||||||
current_line.push(format!("{}: {:.1}%", name, percent));
|
|
||||||
|
|
||||||
// Split into rows when we have 2 items
|
|
||||||
if current_line.len() == 2 {
|
|
||||||
result.push(current_line.join(", "));
|
|
||||||
current_line.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add remaining items as final line
|
|
||||||
if !current_line.is_empty() {
|
|
||||||
result.push(current_line.join(", "));
|
|
||||||
}
|
|
||||||
|
|
||||||
return Some(result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
fn determine_cpu_status(&self, cpu_load_5: f32) -> String {
|
|
||||||
if cpu_load_5 >= 10.0 {
|
|
||||||
"critical".to_string()
|
|
||||||
} else if cpu_load_5 >= 9.0 {
|
|
||||||
"warning".to_string()
|
|
||||||
} else {
|
|
||||||
"ok".to_string()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn determine_cpu_temp_status(&self, temp_c: f32) -> String {
|
|
||||||
if temp_c >= 100.0 {
|
|
||||||
"critical".to_string()
|
|
||||||
} else if temp_c >= 100.0 {
|
|
||||||
"warning".to_string()
|
|
||||||
} else {
|
|
||||||
"ok".to_string()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn determine_memory_status(&self, usage_percent: f32) -> String {
|
|
||||||
if usage_percent >= 95.0 {
|
|
||||||
"critical".to_string()
|
|
||||||
} else if usage_percent >= 80.0 {
|
|
||||||
"warning".to_string()
|
|
||||||
} else {
|
|
||||||
"ok".to_string()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_top_cpu_process(&self) -> Option<String> {
|
|
||||||
// Get top CPU process using ps command
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/ps")
|
|
||||||
.args(["aux", "--sort=-pcpu"])
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
// Skip header line and get first process
|
|
||||||
for line in stdout.lines().skip(1) {
|
|
||||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
|
||||||
if fields.len() >= 11 {
|
|
||||||
let cpu_percent = fields[2];
|
|
||||||
let command = fields[10];
|
|
||||||
// Skip kernel threads (in brackets) and low CPU processes
|
|
||||||
if !command.starts_with('[') && cpu_percent.parse::<f32>().unwrap_or(0.0) > 0.1 {
|
|
||||||
return Some(format!("{} {:.1}%", command, cpu_percent.parse::<f32>().unwrap_or(0.0)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_top_ram_process(&self) -> Option<String> {
|
|
||||||
// Get top RAM process using ps command
|
|
||||||
let output = Command::new("/run/current-system/sw/bin/ps")
|
|
||||||
.args(["aux", "--sort=-rss"])
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if output.status.success() {
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
// Skip header line and get first process
|
|
||||||
for line in stdout.lines().skip(1) {
|
|
||||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
|
||||||
if fields.len() >= 11 {
|
|
||||||
let mem_percent = fields[3];
|
|
||||||
let command = fields[10];
|
|
||||||
// Skip kernel threads (in brackets) and low memory processes
|
|
||||||
if !command.starts_with('[') && mem_percent.parse::<f32>().unwrap_or(0.0) > 0.1 {
|
|
||||||
return Some(format!("{} {:.1}%", command, mem_percent.parse::<f32>().unwrap_or(0.0)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait]
|
|
||||||
impl Collector for SystemCollector {
|
|
||||||
    /// Collector identifier used for registration/logging.
    fn name(&self) -> &str {
        "system"
    }
|
|
||||||
|
|
||||||
    /// This collector reports under the `System` agent type.
    fn agent_type(&self) -> AgentType {
        AgentType::System
    }
|
|
||||||
|
|
||||||
    /// Scheduling interval configured at construction time.
    fn collect_interval(&self) -> Duration {
        self.interval
    }
|
|
||||||
|
|
||||||
async fn collect(&self) -> Result<CollectorOutput, CollectorError> {
|
|
||||||
if !self.enabled {
|
|
||||||
return Err(CollectorError::ConfigError { message: "SystemCollector disabled".to_string() });
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get CPU load averages
|
|
||||||
let (cpu_load_1, cpu_load_5, cpu_load_15) = self.get_cpu_load().await?;
|
|
||||||
let cpu_status = self.determine_cpu_status(cpu_load_5);
|
|
||||||
|
|
||||||
// Get CPU temperature (optional)
|
|
||||||
let cpu_temp_c = self.get_cpu_temperature().await;
|
|
||||||
let cpu_temp_status = cpu_temp_c.map(|temp| self.determine_cpu_temp_status(temp));
|
|
||||||
|
|
||||||
// Get memory information
|
|
||||||
let (memory_used_mb, memory_total_mb) = self.get_memory_info().await?;
|
|
||||||
let memory_usage_percent = (memory_used_mb / memory_total_mb) * 100.0;
|
|
||||||
let memory_status = self.determine_memory_status(memory_usage_percent);
|
|
||||||
|
|
||||||
// Get C-state information (optional)
|
|
||||||
let cpu_cstate_info = self.get_cpu_cstate_info().await;
|
|
||||||
|
|
||||||
// Get logged-in users (optional)
|
|
||||||
let logged_in_users = self.get_logged_in_users().await;
|
|
||||||
|
|
||||||
// Get top processes
|
|
||||||
let top_cpu_process = self.get_top_cpu_process().await;
|
|
||||||
let top_ram_process = self.get_top_ram_process().await;
|
|
||||||
|
|
||||||
let mut system_metrics = json!({
|
|
||||||
"summary": {
|
|
||||||
"cpu_load_1": cpu_load_1,
|
|
||||||
"cpu_load_5": cpu_load_5,
|
|
||||||
"cpu_load_15": cpu_load_15,
|
|
||||||
"cpu_status": cpu_status,
|
|
||||||
"memory_used_mb": memory_used_mb,
|
|
||||||
"memory_total_mb": memory_total_mb,
|
|
||||||
"memory_usage_percent": memory_usage_percent,
|
|
||||||
"memory_status": memory_status,
|
|
||||||
},
|
|
||||||
"timestamp": chrono::Utc::now().timestamp() as u64,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Add optional metrics if available
|
|
||||||
if let Some(temp) = cpu_temp_c {
|
|
||||||
system_metrics["summary"]["cpu_temp_c"] = json!(temp);
|
|
||||||
if let Some(status) = cpu_temp_status {
|
|
||||||
system_metrics["summary"]["cpu_temp_status"] = json!(status);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(cstates) = cpu_cstate_info {
|
|
||||||
system_metrics["summary"]["cpu_cstate"] = json!(cstates);
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(users) = logged_in_users {
|
|
||||||
system_metrics["summary"]["logged_in_users"] = json!(users);
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(cpu_proc) = top_cpu_process {
|
|
||||||
system_metrics["summary"]["top_cpu_process"] = json!(cpu_proc);
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(ram_proc) = top_ram_process {
|
|
||||||
system_metrics["summary"]["top_ram_process"] = json!(ram_proc);
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!("System metrics collected: CPU load {:.2}, Memory {:.1}%",
|
|
||||||
cpu_load_5, memory_usage_percent);
|
|
||||||
|
|
||||||
Ok(CollectorOutput {
|
|
||||||
agent_type: AgentType::System,
|
|
||||||
data: system_metrics,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
884
agent/src/collectors/systemd.rs
Normal file
884
agent/src/collectors/systemd.rs
Normal file
@@ -0,0 +1,884 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
||||||
|
use std::process::Command;
|
||||||
|
use std::sync::RwLock;
|
||||||
|
use std::time::Instant;
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use super::{Collector, CollectorError};
|
||||||
|
use crate::config::SystemdConfig;
|
||||||
|
use crate::service_tracker::UserStoppedServiceTracker;
|
||||||
|
|
||||||
|
/// Systemd collector for monitoring systemd services.
///
/// Discovers interesting services via `systemctl`, caches their status, and
/// periodically probes nginx site latency. All mutable caches live behind a
/// `RwLock` so `&self` collection methods can refresh them.
pub struct SystemdCollector {
    /// Cached state with thread-safe interior mutability
    state: RwLock<ServiceCacheState>,
    /// Configuration for service monitoring (filters, intervals, timeouts)
    config: SystemdConfig,
}
|
||||||
|
|
||||||
|
/// Internal state for service caching.
///
/// Held behind a `RwLock` inside `SystemdCollector`; refreshed lazily when
/// the discovery / nginx-check intervals elapse.
#[derive(Debug)]
struct ServiceCacheState {
    /// Interesting services to monitor (cached after discovery)
    monitored_services: Vec<String>,
    /// Cached service status information from discovery
    service_status_cache: std::collections::HashMap<String, ServiceStatusInfo>,
    /// Last time services were discovered
    last_discovery_time: Option<Instant>,
    /// How often to rediscover services, in seconds.
    /// NOTE(review): original comment said "(5 minutes)" but the value is
    /// copied from `SystemdConfig::interval_seconds` — confirm intended source.
    discovery_interval_seconds: u64,
    /// Cached nginx site latency metrics
    nginx_site_metrics: Vec<Metric>,
    /// Last time nginx sites were checked
    last_nginx_check_time: Option<Instant>,
    /// How often to check nginx site latency, in seconds (configurable)
    nginx_check_interval_seconds: u64,
}
|
||||||
|
|
||||||
|
/// Cached service status information from `systemctl list-units`.
///
/// The three fields mirror the LOAD / ACTIVE / SUB columns of
/// `systemctl list-units --type=service`.
#[derive(Debug, Clone)]
struct ServiceStatusInfo {
    /// LOAD column (e.g. "loaded"; "not-loaded" for never-started units)
    load_state: String,
    /// ACTIVE column (e.g. "active", "inactive", "failed")
    active_state: String,
    /// SUB column (e.g. "running", "dead")
    sub_state: String,
}
|
||||||
|
|
||||||
|
impl SystemdCollector {
|
||||||
|
pub fn new(config: SystemdConfig) -> Self {
|
||||||
|
Self {
|
||||||
|
state: RwLock::new(ServiceCacheState {
|
||||||
|
monitored_services: Vec::new(),
|
||||||
|
service_status_cache: std::collections::HashMap::new(),
|
||||||
|
last_discovery_time: None,
|
||||||
|
discovery_interval_seconds: config.interval_seconds,
|
||||||
|
nginx_site_metrics: Vec::new(),
|
||||||
|
last_nginx_check_time: None,
|
||||||
|
nginx_check_interval_seconds: config.nginx_check_interval_seconds,
|
||||||
|
}),
|
||||||
|
config,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get monitored services, discovering them if needed or cache is expired.
///
/// Returns the cached list when fresh; otherwise re-runs discovery and
/// updates the cache. A discovery failure falls back to the stale cache
/// rather than erroring out.
fn get_monitored_services(&self) -> Result<Vec<String>> {
    // Check if we need discovery without holding the lock
    // (the check scope drops the read guard before any slow work).
    let needs_discovery = {
        let state = self.state.read().unwrap();
        match state.last_discovery_time {
            None => true, // First time
            Some(last_time) => {
                let elapsed = last_time.elapsed().as_secs();
                elapsed >= state.discovery_interval_seconds
            }
        }
    };

    if needs_discovery {
        debug!("Discovering systemd services (cache expired or first run)");
        // Call discover_services_internal which doesn't update state
        match self.discover_services_internal() {
            Ok((services, status_cache)) => {
                // Update state with discovered services in a separate scope.
                // If the write lock is poisoned we silently fall through to
                // the cached list below.
                if let Ok(mut state) = self.state.write() {
                    state.monitored_services = services.clone();
                    state.service_status_cache = status_cache;
                    state.last_discovery_time = Some(Instant::now());
                    debug!(
                        "Auto-discovered {} services to monitor: {:?}",
                        state.monitored_services.len(),
                        state.monitored_services
                    );
                    return Ok(services);
                }
            }
            Err(e) => {
                debug!("Failed to discover services, using cached list: {}", e);
                // Continue with existing cached services if discovery fails
            }
        }
    }

    // Return cached services (possibly stale or empty on first-run failure)
    let state = self.state.read().unwrap();
    Ok(state.monitored_services.clone())
}
|
||||||
|
|
||||||
|
/// Get nginx site metrics, checking them if cache is expired
|
||||||
|
fn get_nginx_site_metrics(&self) -> Vec<Metric> {
|
||||||
|
let mut state = self.state.write().unwrap();
|
||||||
|
|
||||||
|
// Check if we need to refresh nginx site metrics
|
||||||
|
let needs_refresh = match state.last_nginx_check_time {
|
||||||
|
None => true, // First time
|
||||||
|
Some(last_time) => {
|
||||||
|
let elapsed = last_time.elapsed().as_secs();
|
||||||
|
elapsed >= state.nginx_check_interval_seconds
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if needs_refresh {
|
||||||
|
// Only check nginx sites if nginx service is active
|
||||||
|
if state.monitored_services.iter().any(|s| s.contains("nginx")) {
|
||||||
|
debug!(
|
||||||
|
"Refreshing nginx site latency metrics (interval: {}s)",
|
||||||
|
state.nginx_check_interval_seconds
|
||||||
|
);
|
||||||
|
let fresh_metrics = self.get_nginx_sites();
|
||||||
|
state.nginx_site_metrics = fresh_metrics;
|
||||||
|
state.last_nginx_check_time = Some(Instant::now());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
state.nginx_site_metrics.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Auto-discover interesting services to monitor (internal version that doesn't update state).
///
/// Returns `(matching_services, status_cache)`. The status cache covers ALL
/// discovered units (not just matching ones) so later lookups never miss.
/// Two systemctl passes are needed: `list-unit-files` also reports services
/// that have never been started, which `list-units` omits.
fn discover_services_internal(&self) -> Result<(Vec<String>, std::collections::HashMap<String, ServiceStatusInfo>)> {
    debug!("Starting systemd service discovery with status caching");

    // First: Get all service unit files (includes services that have never been started)
    let unit_files_output = Command::new("systemctl")
        .arg("list-unit-files")
        .arg("--type=service")
        .arg("--no-pager")
        .arg("--plain")
        .output()?;

    if !unit_files_output.status.success() {
        return Err(anyhow::anyhow!("systemctl list-unit-files command failed"));
    }

    // Second: Get runtime status of all units
    let units_status_output = Command::new("systemctl")
        .arg("list-units")
        .arg("--type=service")
        .arg("--all")
        .arg("--no-pager")
        .arg("--plain")
        .output()?;

    if !units_status_output.status.success() {
        return Err(anyhow::anyhow!("systemctl list-units command failed"));
    }

    let unit_files_str = String::from_utf8(unit_files_output.stdout)?;
    let units_status_str = String::from_utf8(units_status_output.stdout)?;
    let mut services = Vec::new();

    // Use configuration instead of hardcoded values
    let excluded_services = &self.config.excluded_services;
    let service_name_filters = &self.config.service_name_filters;

    // Parse all service unit files to get complete service list.
    // A HashSet deduplicates units reported by both passes.
    let mut all_service_names = std::collections::HashSet::new();

    for line in unit_files_str.lines() {
        let fields: Vec<&str> = line.split_whitespace().collect();
        if fields.len() >= 2 && fields[0].ends_with(".service") {
            let service_name = fields[0].trim_end_matches(".service");
            all_service_names.insert(service_name.to_string());
            debug!("Found service unit file: {}", service_name);
        }
    }

    // Parse runtime status for all units
    let mut status_cache = std::collections::HashMap::new();
    for line in units_status_str.lines() {
        let fields: Vec<&str> = line.split_whitespace().collect();
        if fields.len() >= 4 && fields[0].ends_with(".service") {
            let service_name = fields[0].trim_end_matches(".service");

            // Extract status information from systemctl list-units output
            // (columns: UNIT LOAD ACTIVE SUB ...).
            let load_state = fields.get(1).unwrap_or(&"unknown").to_string();
            let active_state = fields.get(2).unwrap_or(&"unknown").to_string();
            let sub_state = fields.get(3).unwrap_or(&"unknown").to_string();

            // Cache the status information
            status_cache.insert(service_name.to_string(), ServiceStatusInfo {
                load_state: load_state.clone(),
                active_state: active_state.clone(),
                sub_state: sub_state.clone(),
            });

            debug!("Got runtime status for service: {} (load:{}, active:{}, sub:{})", service_name, load_state, active_state, sub_state);
        }
    }

    // For services found in unit files but not in runtime status, set default inactive status
    for service_name in &all_service_names {
        if !status_cache.contains_key(service_name) {
            status_cache.insert(service_name.to_string(), ServiceStatusInfo {
                load_state: "not-loaded".to_string(),
                active_state: "inactive".to_string(),
                sub_state: "dead".to_string(),
            });
            debug!("Service {} found in unit files but not runtime - marked as inactive", service_name);
        }
    }

    // Now process all discovered services: exclusions win over inclusions.
    for service_name in &all_service_names {
        debug!("Processing service: '{}'", service_name);

        // Skip excluded services first (substring match, not glob)
        let mut is_excluded = false;
        for excluded in excluded_services {
            if service_name.contains(excluded) {
                debug!(
                    "EXCLUDING service '{}' because it matches pattern '{}'",
                    service_name, excluded
                );
                is_excluded = true;
                break;
            }
        }

        if is_excluded {
            debug!("Skipping excluded service: '{}'", service_name);
            continue;
        }

        // Check if this service matches our filter patterns (supports wildcards)
        for pattern in service_name_filters {
            if self.matches_pattern(service_name, pattern) {
                debug!(
                    "INCLUDING service '{}' because it matches pattern '{}'",
                    service_name, pattern
                );
                services.push(service_name.to_string());
                break;
            }
        }
    }

    debug!("Service discovery completed: found {} matching services: {:?}", services.len(), services);
    if services.is_empty() {
        debug!("No services found matching the configured filters - this may indicate a parsing issue");
    }

    Ok((services, status_cache))
}
|
||||||
|
|
||||||
|
/// Check if service name matches pattern (supports wildcards like nginx*)
|
||||||
|
fn matches_pattern(&self, service_name: &str, pattern: &str) -> bool {
|
||||||
|
if pattern.contains('*') {
|
||||||
|
// Wildcard pattern matching
|
||||||
|
if pattern.ends_with('*') {
|
||||||
|
// Pattern like "nginx*" - match if service starts with "nginx"
|
||||||
|
let prefix = &pattern[..pattern.len() - 1];
|
||||||
|
service_name.starts_with(prefix)
|
||||||
|
} else if pattern.starts_with('*') {
|
||||||
|
// Pattern like "*backup" - match if service ends with "backup"
|
||||||
|
let suffix = &pattern[1..];
|
||||||
|
service_name.ends_with(suffix)
|
||||||
|
} else {
|
||||||
|
// Pattern like "nginx*backup" - simple glob matching
|
||||||
|
self.simple_glob_match(service_name, pattern)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Exact match (existing behavior)
|
||||||
|
service_name == pattern
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Simple glob pattern matching for patterns with * in middle.
///
/// Splits `pattern` on '*' and requires: the first segment anchors at the
/// start of `text`, the last segment anchors at the end, and each middle
/// segment appears (leftmost occurrence) after the previous one. Empty
/// segments (from leading/trailing/doubled '*') are skipped.
/// NOTE(review): middle segments take the FIRST occurrence without
/// backtracking, so some matches a full glob engine would find are missed —
/// presumably acceptable for service-name filters; confirm if reused.
fn simple_glob_match(&self, text: &str, pattern: &str) -> bool {
    let parts: Vec<&str> = pattern.split('*').collect();
    if parts.is_empty() {
        return false;
    }

    // Byte offset in `text` up to which the pattern has been consumed.
    let mut pos = 0;
    for (i, part) in parts.iter().enumerate() {
        if part.is_empty() {
            continue;
        }

        if i == 0 {
            // First part must match at start
            if !text[pos..].starts_with(part) {
                return false;
            }
            pos += part.len();
        } else if i == parts.len() - 1 {
            // Last part must match at end
            return text[pos..].ends_with(part);
        } else {
            // Middle part must be found somewhere
            if let Some(found_pos) = text[pos..].find(part) {
                pos += found_pos + part.len();
            } else {
                return false;
            }
        }
    }
    // Reached only when the final non-empty segment was a first/middle part
    // (pattern ends with '*') or every segment was empty.
    true
}
|
||||||
|
|
||||||
|
/// Get service status from cache (if available) or fallback to systemctl.
///
/// Returns `(active_status, detailed_info)`: the systemd ActiveState string
/// plus a "LoadState=...\nActiveState=...\nSubState=..." blob shaped like
/// `systemctl show` output so both code paths are parse-compatible.
fn get_service_status(&self, service: &str) -> Result<(String, String)> {
    // Try to get status from cache first (populated during discovery)
    if let Ok(state) = self.state.read() {
        if let Some(cached_info) = state.service_status_cache.get(service) {
            let active_status = cached_info.active_state.clone();
            // Reconstruct the same key=value layout `systemctl show` emits.
            let detailed_info = format!(
                "LoadState={}\nActiveState={}\nSubState={}",
                cached_info.load_state,
                cached_info.active_state,
                cached_info.sub_state
            );
            return Ok((active_status, detailed_info));
        }
    }

    // Fallback to systemctl if not in cache (shouldn't happen during normal operation)
    debug!("Service '{}' not found in cache, falling back to systemctl", service);
    let output = Command::new("systemctl")
        .arg("is-active")
        .arg(format!("{}.service", service))
        .output()?;

    let active_status = String::from_utf8(output.stdout)?.trim().to_string();

    // Get more detailed info
    let output = Command::new("systemctl")
        .arg("show")
        .arg(format!("{}.service", service))
        .arg("--property=LoadState,ActiveState,SubState")
        .output()?;

    let detailed_info = String::from_utf8(output.stdout)?;
    Ok((active_status, detailed_info))
}
|
||||||
|
|
||||||
|
/// Calculate service status, taking user-stopped services into account
|
||||||
|
fn calculate_service_status(&self, service_name: &str, active_status: &str) -> Status {
|
||||||
|
match active_status.to_lowercase().as_str() {
|
||||||
|
"active" => {
|
||||||
|
// If service is now active and was marked as user-stopped, clear the flag
|
||||||
|
if UserStoppedServiceTracker::is_service_user_stopped(service_name) {
|
||||||
|
debug!("Service '{}' is now active - clearing user-stopped flag", service_name);
|
||||||
|
// Note: We can't directly clear here because this is a read-only context
|
||||||
|
// The agent will need to handle this differently
|
||||||
|
}
|
||||||
|
Status::Ok
|
||||||
|
},
|
||||||
|
"inactive" | "dead" => {
|
||||||
|
// Check if this service was stopped by user action
|
||||||
|
if UserStoppedServiceTracker::is_service_user_stopped(service_name) {
|
||||||
|
debug!("Service '{}' is inactive but marked as user-stopped - treating as OK", service_name);
|
||||||
|
Status::Ok
|
||||||
|
} else {
|
||||||
|
Status::Warning
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"failed" | "error" => Status::Critical,
|
||||||
|
"activating" | "deactivating" | "reloading" | "start" | "stop" | "restart" => {
|
||||||
|
// For user-stopped services that are transitioning, keep them as OK during transition
|
||||||
|
if UserStoppedServiceTracker::is_service_user_stopped(service_name) {
|
||||||
|
debug!("Service '{}' is transitioning but was user-stopped - treating as OK", service_name);
|
||||||
|
Status::Ok
|
||||||
|
} else {
|
||||||
|
Status::Pending
|
||||||
|
}
|
||||||
|
},
|
||||||
|
_ => Status::Unknown,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get service memory usage (if available)
|
||||||
|
fn get_service_memory(&self, service: &str) -> Option<f32> {
|
||||||
|
let output = Command::new("systemctl")
|
||||||
|
.arg("show")
|
||||||
|
.arg(format!("{}.service", service))
|
||||||
|
.arg("--property=MemoryCurrent")
|
||||||
|
.output()
|
||||||
|
.ok()?;
|
||||||
|
|
||||||
|
let output_str = String::from_utf8(output.stdout).ok()?;
|
||||||
|
for line in output_str.lines() {
|
||||||
|
if line.starts_with("MemoryCurrent=") {
|
||||||
|
let memory_str = line.trim_start_matches("MemoryCurrent=");
|
||||||
|
if let Ok(memory_bytes) = memory_str.parse::<u64>() {
|
||||||
|
return Some(memory_bytes as f32 / (1024.0 * 1024.0)); // Convert to MB
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Get directory size in GB with permission-aware logging.
///
/// Runs `sudo du -sb <dir>`; failures (including permission errors) are
/// logged at debug level and yield `None`. Non-empty directories report at
/// least 0.001 GB (1 MB) so tiny sizes remain visible.
fn get_directory_size(&self, dir: &str) -> Option<f32> {
    let output = Command::new("sudo").arg("du").arg("-sb").arg(dir).output().ok()?;

    if !output.status.success() {
        // Log permission errors for debugging but don't spam logs
        let stderr = String::from_utf8_lossy(&output.stderr);
        if stderr.contains("Permission denied") {
            debug!("Permission denied accessing directory: {}", dir);
        } else {
            debug!("Failed to get size for directory {}: {}", dir, stderr);
        }
        return None;
    }

    // `du -sb` prints "<bytes>\t<path>"; the first whitespace-separated
    // token is the byte count.
    let stdout = String::from_utf8(output.stdout).ok()?;
    let size_bytes: u64 = stdout.split_whitespace().next()?.parse().ok()?;

    let size_gb = size_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
    if size_gb > 0.0 {
        // Clamp up to 1 MB minimum for visibility
        Some(size_gb.max(0.001))
    } else {
        None
    }
}
|
||||||
|
|
||||||
|
/// Get service disk usage - simplified and configuration-driven.
///
/// Resolution order: (1) directories configured for the service (exact-name
/// lookup) — first readable one wins, and if all fail `Some(0.0)` is
/// returned deliberately so the dashboard shows 0 rather than hiding the
/// row; (2) the unit's systemd WorkingDirectory. Returns `None` when
/// nothing usable is found.
fn get_service_disk_usage(&self, service: &str) -> Option<f32> {
    // 1. Check if service has configured directories (exact match only)
    if let Some(dirs) = self.config.service_directories.get(service) {
        // Service has configured paths - use the first accessible one
        for dir in dirs {
            if let Some(size) = self.get_directory_size(dir) {
                return Some(size);
            }
        }
        // If configured paths failed, return Some(0.0) (shows as 0)
        return Some(0.0);
    }

    // 2. No configured path - use systemctl WorkingDirectory
    let output = Command::new("systemctl")
        .arg("show")
        .arg(format!("{}.service", service))
        .arg("--property=WorkingDirectory")
        .output()
        .ok()?;

    let output_str = String::from_utf8(output.stdout).ok()?;
    for line in output_str.lines() {
        // Ignore unset or root working directories — measuring "/" would be
        // meaningless (and expensive).
        if line.starts_with("WorkingDirectory=") && !line.contains("[not set]") {
            let dir = line.trim_start_matches("WorkingDirectory=");
            if !dir.is_empty() && dir != "/" {
                return self.get_directory_size(dir);
            }
        }
    }

    None
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
impl Collector for SystemdCollector {

    /// Collect per-service metrics (status, memory, disk) plus nginx-site
    /// and docker-container sub-metrics for services that warrant them.
    ///
    /// Failures are soft: an unavailable service list yields `Ok(vec![])`,
    /// and a single service's status failure only skips that service.
    async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
        let start_time = Instant::now();
        debug!("Collecting systemd services metrics");

        let mut metrics = Vec::new();

        // Get cached services (discovery only happens when needed)
        let monitored_services = match self.get_monitored_services() {
            Ok(services) => services,
            Err(e) => {
                debug!("Failed to get monitored services: {}", e);
                return Ok(metrics);
            }
        };

        // Collect individual metrics for each monitored service (status, memory, disk only)
        for service in &monitored_services {
            match self.get_service_status(service) {
                Ok((active_status, _detailed_info)) => {
                    let status = self.calculate_service_status(service, &active_status);

                    // Individual service status metric
                    // (metric names follow the "service_<name>_<kind>" scheme)
                    metrics.push(Metric {
                        name: format!("service_{}_status", service),
                        value: MetricValue::String(active_status.clone()),
                        unit: None,
                        description: Some(format!("Service {} status", service)),
                        status,
                        timestamp: chrono::Utc::now().timestamp() as u64,
                    });

                    // Service memory usage (if available)
                    if let Some(memory_mb) = self.get_service_memory(service) {
                        metrics.push(Metric {
                            name: format!("service_{}_memory_mb", service),
                            value: MetricValue::Float(memory_mb),
                            unit: Some("MB".to_string()),
                            description: Some(format!("Service {} memory usage", service)),
                            status: Status::Ok,
                            timestamp: chrono::Utc::now().timestamp() as u64,
                        });
                    }

                    // Service disk usage (comprehensive detection)
                    if let Some(disk_gb) = self.get_service_disk_usage(service) {
                        metrics.push(Metric {
                            name: format!("service_{}_disk_gb", service),
                            value: MetricValue::Float(disk_gb),
                            unit: Some("GB".to_string()),
                            description: Some(format!("Service {} disk usage", service)),
                            status: Status::Ok,
                            timestamp: chrono::Utc::now().timestamp() as u64,
                        });
                    }

                    // Sub-service metrics for specific services, only while
                    // the parent service is actually active.
                    if service.contains("nginx") && active_status == "active" {
                        metrics.extend(self.get_nginx_site_metrics());
                    }

                    if service.contains("docker") && active_status == "active" {
                        metrics.extend(self.get_docker_containers());
                    }
                }
                Err(e) => {
                    debug!("Failed to get status for service {}: {}", service, e);
                }
            }
        }

        let collection_time = start_time.elapsed();
        debug!(
            "Systemd collection completed in {:?} with {} individual service metrics",
            collection_time,
            metrics.len()
        );

        Ok(metrics)
    }

}
|
||||||
|
|
||||||
|
impl SystemdCollector {
|
||||||
|
/// Get nginx sites with latency checks.
///
/// Emits one `service_nginx_<site>_latency_ms` metric per discovered site.
/// Latency below the configured critical threshold is Ok, otherwise
/// Critical; an unreachable site reports -1.0 ms with Critical status.
fn get_nginx_sites(&self) -> Vec<Metric> {
    let mut metrics = Vec::new();
    let timestamp = chrono::Utc::now().timestamp() as u64;

    // Discover nginx sites from configuration
    let sites = self.discover_nginx_sites();

    for (site_name, url) in &sites {
        match self.check_site_latency(url) {
            Ok(latency_ms) => {
                let status = if latency_ms < self.config.nginx_latency_critical_ms {
                    Status::Ok
                } else {
                    Status::Critical
                };

                metrics.push(Metric {
                    name: format!("service_nginx_{}_latency_ms", site_name),
                    value: MetricValue::Float(latency_ms),
                    unit: Some("ms".to_string()),
                    description: Some(format!("Response time for {}", url)),
                    status,
                    timestamp,
                });
            }
            Err(_) => {
                // Site is unreachable
                metrics.push(Metric {
                    name: format!("service_nginx_{}_latency_ms", site_name),
                    value: MetricValue::Float(-1.0), // Use -1 to indicate error
                    unit: Some("ms".to_string()),
                    description: Some(format!("Response time for {} (unreachable)", url)),
                    status: Status::Critical,
                    timestamp,
                });
            }
        }
    }

    metrics
}
|
||||||
|
|
||||||
|
/// Get docker containers as sub-services
|
||||||
|
fn get_docker_containers(&self) -> Vec<Metric> {
|
||||||
|
let mut metrics = Vec::new();
|
||||||
|
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||||
|
|
||||||
|
// Check if docker is available
|
||||||
|
let output = Command::new("docker")
|
||||||
|
.arg("ps")
|
||||||
|
.arg("--format")
|
||||||
|
.arg("{{.Names}},{{.Status}}")
|
||||||
|
.output();
|
||||||
|
|
||||||
|
let output = match output {
|
||||||
|
Ok(out) if out.status.success() => out,
|
||||||
|
_ => return metrics, // Docker not available or failed
|
||||||
|
};
|
||||||
|
|
||||||
|
let output_str = match String::from_utf8(output.stdout) {
|
||||||
|
Ok(s) => s,
|
||||||
|
Err(_) => return metrics,
|
||||||
|
};
|
||||||
|
|
||||||
|
for line in output_str.lines() {
|
||||||
|
if line.trim().is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let parts: Vec<&str> = line.split(',').collect();
|
||||||
|
if parts.len() >= 2 {
|
||||||
|
let container_name = parts[0].trim();
|
||||||
|
let status_str = parts[1].trim();
|
||||||
|
|
||||||
|
let status = if status_str.contains("Up") {
|
||||||
|
Status::Ok
|
||||||
|
} else if status_str.contains("Exited") {
|
||||||
|
Status::Warning
|
||||||
|
} else {
|
||||||
|
Status::Critical
|
||||||
|
};
|
||||||
|
|
||||||
|
metrics.push(Metric {
|
||||||
|
name: format!("service_docker_{}_status", container_name),
|
||||||
|
value: MetricValue::String(status_str.to_string()),
|
||||||
|
unit: None,
|
||||||
|
description: Some(format!("Docker container {} status", container_name)),
|
||||||
|
status,
|
||||||
|
timestamp,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
metrics
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check site latency using HTTP GET requests
|
||||||
|
fn check_site_latency(&self, url: &str) -> Result<f32, Box<dyn std::error::Error>> {
|
||||||
|
use std::time::Duration;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
let start = Instant::now();
|
||||||
|
|
||||||
|
// Create HTTP client with timeouts from configuration
|
||||||
|
let client = reqwest::blocking::Client::builder()
|
||||||
|
.timeout(Duration::from_secs(self.config.http_timeout_seconds))
|
||||||
|
.connect_timeout(Duration::from_secs(self.config.http_connect_timeout_seconds))
|
||||||
|
.redirect(reqwest::redirect::Policy::limited(10))
|
||||||
|
.build()?;
|
||||||
|
|
||||||
|
// Make GET request and measure latency
|
||||||
|
let response = client.get(url).send()?;
|
||||||
|
let latency = start.elapsed().as_millis() as f32;
|
||||||
|
|
||||||
|
// Check if response is successful (2xx or 3xx status codes)
|
||||||
|
if response.status().is_success() || response.status().is_redirection() {
|
||||||
|
Ok(latency)
|
||||||
|
} else {
|
||||||
|
Err(format!(
|
||||||
|
"HTTP request failed for {} with status: {}",
|
||||||
|
url,
|
||||||
|
response.status()
|
||||||
|
)
|
||||||
|
.into())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Discover nginx sites from configuration files (like the old working implementation).
///
/// Returns `(site_name, url)` pairs. Config text is obtained from the
/// systemd unit's `-c` path first (NixOS-friendly), then via an `nginx -T`
/// style fallback; an empty vec means no config could be read.
fn discover_nginx_sites(&self) -> Vec<(String, String)> {
    use tracing::debug;

    // Use the same approach as the old working agent: get nginx config from systemd
    let config_content = match self.get_nginx_config_from_systemd() {
        Some(content) => content,
        None => {
            debug!("Could not get nginx config from systemd, trying nginx -T fallback");
            match self.get_nginx_config_via_command() {
                Some(content) => content,
                None => {
                    debug!("Could not get nginx config via any method");
                    return Vec::new();
                }
            }
        }
    };

    // Parse the config content to extract sites
    self.parse_nginx_config_for_sites(&config_content)
}
|
||||||
|
|
||||||
|
/// Get nginx config from systemd service definition (NixOS compatible).
///
/// Reads the unit's ExecStart via `systemctl show`, extracts the `-c`
/// config path (handling both traditional and NixOS `argv[]=` formats),
/// and returns the file's contents. `None` on any failure along the way.
fn get_nginx_config_from_systemd(&self) -> Option<String> {
    use tracing::debug;

    let output = std::process::Command::new("systemctl")
        .args(["show", "nginx", "--property=ExecStart", "--no-pager"])
        .output()
        .ok()?;

    if !output.status.success() {
        debug!("Failed to get nginx ExecStart from systemd");
        return None;
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    debug!("systemctl show nginx output: {}", stdout);

    // Parse ExecStart to extract -c config path
    for line in stdout.lines() {
        if line.starts_with("ExecStart=") {
            debug!("Found ExecStart line: {}", line);
            // Handle both traditional and NixOS systemd formats
            if let Some(config_path) = self.extract_config_path_from_exec_start(line) {
                debug!("Extracted config path: {}", config_path);
                // Read the config file; a read failure is logged (via the
                // map_err side effect) and collapses to None.
                return std::fs::read_to_string(&config_path)
                    .map_err(|e| debug!("Failed to read config file {}: {}", config_path, e))
                    .ok();
            }
        }
    }

    None
}
|
||||||
|
|
||||||
|
/// Extract config path from ExecStart line
|
||||||
|
fn extract_config_path_from_exec_start(&self, exec_start: &str) -> Option<String> {
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
// Remove ExecStart= prefix
|
||||||
|
let exec_part = exec_start.strip_prefix("ExecStart=")?;
|
||||||
|
debug!("Parsing exec part: {}", exec_part);
|
||||||
|
|
||||||
|
// Handle NixOS format: ExecStart={ path=...; argv[]=...nginx -c /config; ... }
|
||||||
|
if exec_part.contains("argv[]=") {
|
||||||
|
// Extract the part after argv[]=
|
||||||
|
let argv_start = exec_part.find("argv[]=")?;
|
||||||
|
let argv_part = &exec_part[argv_start + 7..]; // Skip "argv[]="
|
||||||
|
debug!("Found NixOS argv part: {}", argv_part);
|
||||||
|
|
||||||
|
// Look for -c flag followed by config path
|
||||||
|
if let Some(c_pos) = argv_part.find(" -c ") {
|
||||||
|
let after_c = &argv_part[c_pos + 4..];
|
||||||
|
// Find the config path (until next space or semicolon)
|
||||||
|
let config_path = after_c.split([' ', ';']).next()?;
|
||||||
|
return Some(config_path.to_string());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Handle traditional format: ExecStart=/path/nginx -c /config
|
||||||
|
debug!("Parsing traditional format");
|
||||||
|
if let Some(c_pos) = exec_part.find(" -c ") {
|
||||||
|
let after_c = &exec_part[c_pos + 4..];
|
||||||
|
let config_path = after_c.split_whitespace().next()?;
|
||||||
|
return Some(config_path.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fallback: get nginx config via nginx -T command
|
||||||
|
fn get_nginx_config_via_command(&self) -> Option<String> {
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
let output = std::process::Command::new("nginx")
|
||||||
|
.args(["-T"])
|
||||||
|
.output()
|
||||||
|
.ok()?;
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
debug!("nginx -T failed");
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(String::from_utf8_lossy(&output.stdout).to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse nginx config content to extract server names and build site list
|
||||||
|
fn parse_nginx_config_for_sites(&self, config_content: &str) -> Vec<(String, String)> {
|
||||||
|
use tracing::debug;
|
||||||
|
let mut sites = Vec::new();
|
||||||
|
let lines: Vec<&str> = config_content.lines().collect();
|
||||||
|
let mut i = 0;
|
||||||
|
|
||||||
|
debug!("Parsing nginx config with {} lines", lines.len());
|
||||||
|
|
||||||
|
while i < lines.len() {
|
||||||
|
let line = lines[i].trim();
|
||||||
|
if line.starts_with("server") && line.contains("{") {
|
||||||
|
if let Some(server_name) = self.parse_server_block(&lines, &mut i) {
|
||||||
|
let url = format!("https://{}", server_name);
|
||||||
|
sites.push((server_name.clone(), url));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!("Discovered {} nginx sites total", sites.len());
|
||||||
|
sites
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse a server block to extract the primary server_name
|
||||||
|
fn parse_server_block(&self, lines: &[&str], start_index: &mut usize) -> Option<String> {
|
||||||
|
use tracing::debug;
|
||||||
|
let mut server_names = Vec::new();
|
||||||
|
let mut has_redirect = false;
|
||||||
|
let mut i = *start_index + 1;
|
||||||
|
let mut brace_count = 1;
|
||||||
|
|
||||||
|
// Parse until we close the server block
|
||||||
|
while i < lines.len() && brace_count > 0 {
|
||||||
|
let trimmed = lines[i].trim();
|
||||||
|
|
||||||
|
// Track braces
|
||||||
|
brace_count += trimmed.matches('{').count();
|
||||||
|
brace_count -= trimmed.matches('}').count();
|
||||||
|
|
||||||
|
// Extract server_name
|
||||||
|
if trimmed.starts_with("server_name") {
|
||||||
|
if let Some(names_part) = trimmed.strip_prefix("server_name") {
|
||||||
|
let names_clean = names_part.trim().trim_end_matches(';');
|
||||||
|
for name in names_clean.split_whitespace() {
|
||||||
|
if name != "_"
|
||||||
|
&& !name.is_empty()
|
||||||
|
&& name.contains('.')
|
||||||
|
&& !name.starts_with('$')
|
||||||
|
{
|
||||||
|
server_names.push(name.to_string());
|
||||||
|
debug!("Found server_name in block: {}", name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Check for redirects (skip redirect-only servers)
|
||||||
|
if trimmed.contains("return") && (trimmed.contains("301") || trimmed.contains("302")) {
|
||||||
|
has_redirect = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
*start_index = i - 1;
|
||||||
|
|
||||||
|
if !server_names.is_empty() && !has_redirect {
|
||||||
|
return Some(server_names[0].clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
118
agent/src/communication/mod.rs
Normal file
118
agent/src/communication/mod.rs
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use cm_dashboard_shared::{MessageEnvelope, MetricMessage};
|
||||||
|
use tracing::{debug, info};
|
||||||
|
use zmq::{Context, Socket, SocketType};
|
||||||
|
|
||||||
|
use crate::config::ZmqConfig;
|
||||||
|
|
||||||
|
/// ZMQ communication handler for publishing metrics and receiving commands
pub struct ZmqHandler {
    /// PUB socket: broadcasts serialized metric envelopes to subscribers.
    publisher: Socket,
    /// PULL socket: receives JSON-encoded `AgentCommand`s from the dashboard.
    command_receiver: Socket,
}
|
||||||
|
|
||||||
|
impl ZmqHandler {
|
||||||
|
pub async fn new(config: &ZmqConfig) -> Result<Self> {
|
||||||
|
let context = Context::new();
|
||||||
|
|
||||||
|
// Create publisher socket for metrics
|
||||||
|
let publisher = context.socket(SocketType::PUB)?;
|
||||||
|
let pub_bind_address = format!("tcp://{}:{}", config.bind_address, config.publisher_port);
|
||||||
|
publisher.bind(&pub_bind_address)?;
|
||||||
|
|
||||||
|
info!("ZMQ publisher bound to {}", pub_bind_address);
|
||||||
|
|
||||||
|
// Set socket options for efficiency
|
||||||
|
publisher.set_sndhwm(1000)?; // High water mark for outbound messages
|
||||||
|
publisher.set_linger(1000)?; // Linger time on close
|
||||||
|
|
||||||
|
// Create command receiver socket (PULL socket to receive commands from dashboard)
|
||||||
|
let command_receiver = context.socket(SocketType::PULL)?;
|
||||||
|
let cmd_bind_address = format!("tcp://{}:{}", config.bind_address, config.command_port);
|
||||||
|
command_receiver.bind(&cmd_bind_address)?;
|
||||||
|
|
||||||
|
info!("ZMQ command receiver bound to {}", cmd_bind_address);
|
||||||
|
|
||||||
|
// Set non-blocking mode for command receiver
|
||||||
|
command_receiver.set_rcvtimeo(0)?; // Non-blocking receive
|
||||||
|
command_receiver.set_linger(1000)?;
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
publisher,
|
||||||
|
command_receiver,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Publish metrics message via ZMQ
|
||||||
|
pub async fn publish_metrics(&self, message: &MetricMessage) -> Result<()> {
|
||||||
|
debug!(
|
||||||
|
"Publishing {} metrics for host {}",
|
||||||
|
message.metrics.len(),
|
||||||
|
message.hostname
|
||||||
|
);
|
||||||
|
|
||||||
|
// Create message envelope
|
||||||
|
let envelope = MessageEnvelope::metrics(message.clone())
|
||||||
|
.map_err(|e| anyhow::anyhow!("Failed to create message envelope: {}", e))?;
|
||||||
|
|
||||||
|
// Serialize envelope
|
||||||
|
let serialized = serde_json::to_vec(&envelope)?;
|
||||||
|
|
||||||
|
// Send via ZMQ
|
||||||
|
self.publisher.send(&serialized, 0)?;
|
||||||
|
|
||||||
|
debug!("Published metrics message ({} bytes)", serialized.len());
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Send heartbeat (placeholder for future use)
|
||||||
|
|
||||||
|
/// Try to receive a command (non-blocking)
|
||||||
|
pub fn try_receive_command(&self) -> Result<Option<AgentCommand>> {
|
||||||
|
match self.command_receiver.recv_bytes(zmq::DONTWAIT) {
|
||||||
|
Ok(bytes) => {
|
||||||
|
debug!("Received command message ({} bytes)", bytes.len());
|
||||||
|
|
||||||
|
let command: AgentCommand = serde_json::from_slice(&bytes)
|
||||||
|
.map_err(|e| anyhow::anyhow!("Failed to deserialize command: {}", e))?;
|
||||||
|
|
||||||
|
debug!("Parsed command: {:?}", command);
|
||||||
|
Ok(Some(command))
|
||||||
|
}
|
||||||
|
Err(zmq::Error::EAGAIN) => {
|
||||||
|
// No message available (non-blocking)
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
Err(e) => Err(anyhow::anyhow!("ZMQ receive error: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Commands that can be sent to the agent
///
/// Delivered as JSON over the ZMQ command (PULL) socket; see
/// `ZmqHandler::try_receive_command` for the receive/deserialize path.
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
pub enum AgentCommand {
    /// Request immediate metric collection
    CollectNow,
    /// Change collection interval
    SetInterval { seconds: u64 },
    /// Enable/disable a collector
    ToggleCollector { name: String, enabled: bool },
    /// Request status/health check
    Ping,
    /// Control systemd service
    ServiceControl {
        // Unit name the action applies to.
        service_name: String,
        // What to do with the unit; see `ServiceAction`.
        action: ServiceAction,
    },
}
|
||||||
|
|
||||||
|
/// Service control actions
///
/// Payload for `AgentCommand::ServiceControl`. The `User*` variants carry
/// extra bookkeeping about a user-stopped flag (see trailing comments).
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
pub enum ServiceAction {
    Start,
    Stop,
    Status,
    UserStart, // User-initiated start (clears user-stopped flag)
    UserStop,  // User-initiated stop (marks as user-stopped)
}
|
||||||
2
agent/src/config/defaults.rs
Normal file
2
agent/src/config/defaults.rs
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
// This file is now empty - all configuration values come from config files
|
||||||
|
// No hardcoded defaults are used
|
||||||
19
agent/src/config/loader.rs
Normal file
19
agent/src/config/loader.rs
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
use crate::config::AgentConfig;
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
use std::fs;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
pub fn load_config<P: AsRef<Path>>(path: P) -> Result<AgentConfig> {
|
||||||
|
let path = path.as_ref();
|
||||||
|
let content = fs::read_to_string(path)
|
||||||
|
.with_context(|| format!("Failed to read config file: {}", path.display()))?;
|
||||||
|
|
||||||
|
let config: AgentConfig = toml::from_str(&content)
|
||||||
|
.with_context(|| format!("Failed to parse config file: {}", path.display()))?;
|
||||||
|
|
||||||
|
config
|
||||||
|
.validate()
|
||||||
|
.with_context(|| format!("Invalid configuration in file: {}", path.display()))?;
|
||||||
|
|
||||||
|
Ok(config)
|
||||||
|
}
|
||||||
160
agent/src/config/mod.rs
Normal file
160
agent/src/config/mod.rs
Normal file
@@ -0,0 +1,160 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use cm_dashboard_shared::CacheConfig;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
pub mod loader;
|
||||||
|
pub mod validation;
|
||||||
|
|
||||||
|
use crate::status::HostStatusConfig;
|
||||||
|
|
||||||
|
/// Main agent configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AgentConfig {
    /// ZMQ transport settings (ports, bind address, intervals).
    pub zmq: ZmqConfig,
    /// Per-collector settings and thresholds.
    pub collectors: CollectorConfig,
    /// Metric cache settings (type shared via `cm_dashboard_shared`).
    pub cache: CacheConfig,
    /// Email (SMTP) notification settings.
    pub notifications: NotificationConfig,
    /// Host-level status aggregation settings.
    pub status_aggregation: HostStatusConfig,
    /// Metric collection interval in seconds (validated to be non-zero).
    pub collection_interval_seconds: u64,
}
|
||||||
|
|
||||||
|
/// ZMQ communication configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ZmqConfig {
    /// Port the metrics PUB socket binds to (non-zero).
    pub publisher_port: u16,
    /// Port the command PULL socket binds to (non-zero, distinct from
    /// `publisher_port`).
    pub command_port: u16,
    /// Local address both sockets bind on (non-empty), e.g. "0.0.0.0".
    pub bind_address: String,
    /// Socket timeout in milliseconds (validated to be non-zero).
    pub timeout_ms: u64,
    /// Heartbeat interval in milliseconds.
    pub heartbeat_interval_ms: u64,
    /// How often collected metrics are transmitted, in seconds.
    pub transmission_interval_seconds: u64,
}
|
||||||
|
|
||||||
|
/// Collector configuration
///
/// One entry per metric collector; each carries its own `enabled` flag
/// and collection interval.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CollectorConfig {
    pub cpu: CpuConfig,
    pub memory: MemoryConfig,
    pub disk: DiskConfig,
    pub systemd: SystemdConfig,
    pub backup: BackupConfig,
    pub network: NetworkConfig,
    pub nixos: NixOSConfig,
}
|
||||||
|
|
||||||
|
/// CPU collector configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuConfig {
    pub enabled: bool,
    pub interval_seconds: u64,
    /// Load-average warning threshold (validated > 0).
    pub load_warning_threshold: f32,
    /// Load-average critical threshold (validated > warning).
    pub load_critical_threshold: f32,
    /// Temperature warning threshold (validated > 0; presumably °C — confirm).
    pub temperature_warning_threshold: f32,
    /// Temperature critical threshold (validated > warning).
    pub temperature_critical_threshold: f32,
}
|
||||||
|
|
||||||
|
/// Memory collector configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryConfig {
    pub enabled: bool,
    pub interval_seconds: u64,
    /// Memory usage warning threshold (percentage)
    // Validated to be in (0, 100].
    pub usage_warning_percent: f32,
    /// Memory usage critical threshold (percentage)
    // Validated to be > warning and <= 100.
    pub usage_critical_percent: f32,
}
|
||||||
|
|
||||||
|
/// Disk collector configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiskConfig {
    pub enabled: bool,
    pub interval_seconds: u64,
    /// Disk usage warning threshold (percentage)
    // Validated to be in (0, 100].
    pub usage_warning_percent: f32,
    /// Disk usage critical threshold (percentage)
    // Validated to be > warning and <= 100.
    pub usage_critical_percent: f32,
    /// Filesystem configurations
    pub filesystems: Vec<FilesystemConfig>,
    /// SMART monitoring thresholds
    pub temperature_warning_celsius: f32,
    pub temperature_critical_celsius: f32,
    /// Drive wear thresholds as percentages.
    pub wear_warning_percent: f32,
    pub wear_critical_percent: f32,
}
|
||||||
|
|
||||||
|
/// Filesystem configuration entry
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FilesystemConfig {
    /// Display name for this filesystem.
    pub name: String,
    /// Filesystem UUID identifying the device.
    pub uuid: String,
    /// Mount point path, e.g. "/" or "/mnt/data".
    pub mount_point: String,
    pub fs_type: String, // "ext4", "zfs", "xfs", "mergerfs", "btrfs"
    /// Whether this filesystem should be monitored.
    pub monitor: bool,
    pub storage_type: String, // "single", "raid", "mirror", "mergerfs", "zfs"
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Systemd services collector configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemdConfig {
    pub enabled: bool,
    pub interval_seconds: u64,
    /// Name filters selecting which units to monitor.
    pub service_name_filters: Vec<String>,
    /// Units to skip even when they match a filter.
    pub excluded_services: Vec<String>,
    /// Per-service memory warning threshold in MB.
    pub memory_warning_mb: f32,
    /// Per-service memory critical threshold in MB.
    pub memory_critical_mb: f32,
    // Maps a key to a list of directories — NOTE(review): exact semantics
    // (service name vs. host as key) not visible here; confirm in collector.
    pub service_directories: std::collections::HashMap<String, Vec<String>>,
    // NOTE(review): semantics not visible from this file; confirm usage.
    pub host_user_mapping: String,
    /// Interval between nginx site HTTP checks, in seconds.
    pub nginx_check_interval_seconds: u64,
    /// Total HTTP request timeout for site checks, in seconds.
    pub http_timeout_seconds: u64,
    /// HTTP connect timeout for site checks, in seconds.
    pub http_connect_timeout_seconds: u64,
    /// Latency above this value (ms) is critical (validated > 0).
    pub nginx_latency_critical_ms: f32,
}
|
||||||
|
|
||||||
|
|
||||||
|
/// NixOS collector configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NixOSConfig {
    pub enabled: bool,
    /// Collection interval in seconds.
    pub interval_seconds: u64,
}
|
||||||
|
|
||||||
|
/// Backup collector configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BackupConfig {
    pub enabled: bool,
    /// Collection interval in seconds.
    pub interval_seconds: u64,
    /// Paths checked for backup artifacts.
    pub backup_paths: Vec<String>,
    /// Maximum acceptable backup age in hours before it is considered stale.
    pub max_age_hours: u64,
}
|
||||||
|
|
||||||
|
/// Network collector configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkConfig {
    pub enabled: bool,
    /// Collection interval in seconds.
    pub interval_seconds: u64,
}
|
||||||
|
|
||||||
|
/// Notification configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NotificationConfig {
    pub enabled: bool,
    /// SMTP server host (non-empty when notifications are enabled).
    pub smtp_host: String,
    /// SMTP server port (non-zero when notifications are enabled).
    pub smtp_port: u16,
    /// Sender address; must contain '@'.
    pub from_email: String,
    /// Recipient address; must contain '@'.
    pub to_email: String,
    // Minimum minutes between repeated notifications — TODO confirm scope
    // (per-alert vs. global) against the notifier implementation.
    pub rate_limit_minutes: u64,
    /// Email notification batching interval in seconds (default: 60)
    pub aggregation_interval_seconds: u64,
}
|
||||||
|
|
||||||
|
|
||||||
|
impl AgentConfig {
    /// Load and validate a configuration from a TOML file
    /// (thin wrapper over `loader::load_config`).
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        loader::load_config(path)
    }

    /// Check cross-field invariants (ports, thresholds, SMTP settings);
    /// delegates to `validation::validate_config`.
    pub fn validate(&self) -> Result<()> {
        validation::validate_config(self)
    }
}
|
||||||
127
agent/src/config/validation.rs
Normal file
127
agent/src/config/validation.rs
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
use crate::config::AgentConfig;
|
||||||
|
use anyhow::{bail, Result};
|
||||||
|
|
||||||
|
pub fn validate_config(config: &AgentConfig) -> Result<()> {
|
||||||
|
// Validate ZMQ configuration
|
||||||
|
if config.zmq.publisher_port == 0 {
|
||||||
|
bail!("ZMQ publisher port cannot be 0");
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.zmq.command_port == 0 {
|
||||||
|
bail!("ZMQ command port cannot be 0");
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.zmq.publisher_port == config.zmq.command_port {
|
||||||
|
bail!("ZMQ publisher and command ports cannot be the same");
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.zmq.bind_address.is_empty() {
|
||||||
|
bail!("ZMQ bind address cannot be empty");
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.zmq.timeout_ms == 0 {
|
||||||
|
bail!("ZMQ timeout cannot be 0");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate collection interval
|
||||||
|
if config.collection_interval_seconds == 0 {
|
||||||
|
bail!("Collection interval cannot be 0");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate CPU thresholds
|
||||||
|
if config.collectors.cpu.enabled {
|
||||||
|
if config.collectors.cpu.load_warning_threshold <= 0.0 {
|
||||||
|
bail!("CPU load warning threshold must be positive");
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.collectors.cpu.load_critical_threshold
|
||||||
|
<= config.collectors.cpu.load_warning_threshold
|
||||||
|
{
|
||||||
|
bail!("CPU load critical threshold must be greater than warning threshold");
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.collectors.cpu.temperature_warning_threshold <= 0.0 {
|
||||||
|
bail!("CPU temperature warning threshold must be positive");
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.collectors.cpu.temperature_critical_threshold
|
||||||
|
<= config.collectors.cpu.temperature_warning_threshold
|
||||||
|
{
|
||||||
|
bail!("CPU temperature critical threshold must be greater than warning threshold");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate memory thresholds
|
||||||
|
if config.collectors.memory.enabled {
|
||||||
|
if config.collectors.memory.usage_warning_percent <= 0.0
|
||||||
|
|| config.collectors.memory.usage_warning_percent > 100.0
|
||||||
|
{
|
||||||
|
bail!("Memory usage warning threshold must be between 0 and 100");
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.collectors.memory.usage_critical_percent
|
||||||
|
<= config.collectors.memory.usage_warning_percent
|
||||||
|
|| config.collectors.memory.usage_critical_percent > 100.0
|
||||||
|
{
|
||||||
|
bail!("Memory usage critical threshold must be between warning threshold and 100");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate disk thresholds
|
||||||
|
if config.collectors.disk.enabled {
|
||||||
|
if config.collectors.disk.usage_warning_percent <= 0.0
|
||||||
|
|| config.collectors.disk.usage_warning_percent > 100.0
|
||||||
|
{
|
||||||
|
bail!("Disk usage warning threshold must be between 0 and 100");
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.collectors.disk.usage_critical_percent
|
||||||
|
<= config.collectors.disk.usage_warning_percent
|
||||||
|
|| config.collectors.disk.usage_critical_percent > 100.0
|
||||||
|
{
|
||||||
|
bail!("Disk usage critical threshold must be between warning threshold and 100");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate systemd configuration
|
||||||
|
if config.collectors.systemd.enabled {
|
||||||
|
if config.collectors.systemd.nginx_latency_critical_ms <= 0.0 {
|
||||||
|
bail!("Nginx latency critical threshold must be positive");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate SMTP configuration
|
||||||
|
if config.notifications.enabled {
|
||||||
|
if config.notifications.smtp_host.is_empty() {
|
||||||
|
bail!("SMTP host cannot be empty when notifications are enabled");
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.notifications.smtp_port == 0 {
|
||||||
|
bail!("SMTP port cannot be 0");
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.notifications.from_email.is_empty() {
|
||||||
|
bail!("From email cannot be empty when notifications are enabled");
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.notifications.to_email.is_empty() {
|
||||||
|
bail!("To email cannot be empty when notifications are enabled");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic email validation
|
||||||
|
if !config.notifications.from_email.contains('@') {
|
||||||
|
bail!("From email must contain @ symbol");
|
||||||
|
}
|
||||||
|
|
||||||
|
if !config.notifications.to_email.contains('@') {
|
||||||
|
bail!("To email must contain @ symbol");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate cache configuration
|
||||||
|
if config.cache.persist_path.is_empty() {
|
||||||
|
bail!("Cache persist path cannot be empty");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
@@ -1,444 +0,0 @@
|
|||||||
use std::collections::HashSet;
|
|
||||||
use std::process::Stdio;
|
|
||||||
use tokio::fs;
|
|
||||||
use tokio::process::Command;
|
|
||||||
use tracing::{debug, warn};
|
|
||||||
|
|
||||||
use crate::collectors::CollectorError;
|
|
||||||
|
|
||||||
pub struct AutoDiscovery;
|
|
||||||
|
|
||||||
impl AutoDiscovery {
|
|
||||||
/// Auto-detect storage devices suitable for SMART monitoring
|
|
||||||
pub async fn discover_storage_devices() -> Vec<String> {
|
|
||||||
let mut devices = Vec::new();
|
|
||||||
|
|
||||||
// Method 1: Try lsblk to find block devices
|
|
||||||
if let Ok(lsblk_devices) = Self::discover_via_lsblk().await {
|
|
||||||
devices.extend(lsblk_devices);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Method 2: Scan /dev for common device patterns
|
|
||||||
if devices.is_empty() {
|
|
||||||
if let Ok(dev_devices) = Self::discover_via_dev_scan().await {
|
|
||||||
devices.extend(dev_devices);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Method 3: Fallback to common device names
|
|
||||||
if devices.is_empty() {
|
|
||||||
devices = Self::fallback_device_names();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Remove duplicates and sort
|
|
||||||
let mut unique_devices: Vec<String> = devices
|
|
||||||
.into_iter()
|
|
||||||
.collect::<HashSet<_>>()
|
|
||||||
.into_iter()
|
|
||||||
.collect();
|
|
||||||
unique_devices.sort();
|
|
||||||
|
|
||||||
debug!("Auto-detected storage devices: {:?}", unique_devices);
|
|
||||||
unique_devices
|
|
||||||
}
|
|
||||||
|
|
||||||
/// List whole-disk block devices via `lsblk -d -o NAME,TYPE -n -r`.
///
/// Uses the absolute NixOS path for the binary. Returns the names of
/// devices reported as TYPE "disk" that also pass `is_suitable_device`.
///
/// # Errors
/// `CollectorError::CommandFailed` when lsblk cannot be spawned or exits
/// non-zero (stderr captured in the message).
async fn discover_via_lsblk() -> Result<Vec<String>, CollectorError> {
    // -d: no partitions, -n: no header, -r: raw (space-separated) output.
    let output = Command::new("/run/current-system/sw/bin/lsblk")
        .args(["-d", "-o", "NAME,TYPE", "-n", "-r"])
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .output()
        .await
        .map_err(|e| CollectorError::CommandFailed {
            command: "lsblk".to_string(),
            message: e.to_string(),
        })?;

    if !output.status.success() {
        return Err(CollectorError::CommandFailed {
            command: "lsblk".to_string(),
            message: String::from_utf8_lossy(&output.stderr).to_string(),
        });
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    let mut devices = Vec::new();

    // Each raw output line is "<NAME> <TYPE>".
    for line in stdout.lines() {
        let parts: Vec<&str> = line.split_whitespace().collect();
        if parts.len() >= 2 {
            let device_name = parts[0];
            let device_type = parts[1];

            // Include disk type devices and filter out unwanted ones
            if device_type == "disk" && Self::is_suitable_device(device_name) {
                devices.push(device_name.to_string());
            }
        }
    }

    Ok(devices)
}
|
|
||||||
|
|
||||||
/// Fallback discovery: scan `/dev` entries for storage-device name patterns.
///
/// # Errors
/// `CollectorError::IoError` when the directory cannot be read or iterated.
async fn discover_via_dev_scan() -> Result<Vec<String>, CollectorError> {
    let mut devices = Vec::new();

    // Read /dev directory
    let mut dev_entries = fs::read_dir("/dev")
        .await
        .map_err(|e| CollectorError::IoError {
            message: e.to_string(),
        })?;

    // Async directory iteration: next_entry() yields None at the end.
    while let Some(entry) =
        dev_entries
            .next_entry()
            .await
            .map_err(|e| CollectorError::IoError {
                message: e.to_string(),
            })?
    {
        let file_name = entry.file_name();
        let device_name = file_name.to_string_lossy();

        // Keep only names matching whole-device patterns (no partitions).
        if Self::is_suitable_device(&device_name) {
            devices.push(device_name.to_string());
        }
    }

    Ok(devices)
}
|
|
||||||
|
|
||||||
/// Decide whether a device name looks like a whole storage device
/// (not a partition, loop device, etc.).
///
/// Accepts NVMe namespace devices (`nvme<ctrl>n<ns>`, e.g. "nvme0n1")
/// and three-letter SATA/IDE/virtio disks ("sda", "hdb", "vdc").
/// Rejects partitions ("sda1", "nvme0n1p1") and bare NVMe controllers
/// ("nvme0") — the previous `contains("n")` test was trivially true for
/// every name starting with "nvme" and let controllers through.
fn is_suitable_device(device_name: &str) -> bool {
    if let Some(rest) = device_name.strip_prefix("nvme") {
        // Expect exactly "<digits>n<digits>": a controller index, the
        // namespace separator, and a namespace index. A trailing "p<N>"
        // (partition) makes the namespace part non-numeric and is rejected.
        if let Some((ctrl, ns)) = rest.split_once('n') {
            return !ctrl.is_empty()
                && !ns.is_empty()
                && ctrl.bytes().all(|b| b.is_ascii_digit())
                && ns.bytes().all(|b| b.is_ascii_digit());
        }
        return false;
    }

    // sda/hdb/vdc etc. — exactly three chars, so partitions ("sda1") fail.
    (device_name.starts_with("sd")
        || device_name.starts_with("hd")
        || device_name.starts_with("vd"))
        && device_name.len() == 3
}
|
|
||||||
|
|
||||||
/// Conservative default device names used when detection finds nothing.
fn fallback_device_names() -> Vec<String> {
    ["nvme0n1", "sda", "sdb"]
        .iter()
        .map(|name| name.to_string())
        .collect()
}
|
|
||||||
|
|
||||||
/// Auto-detect systemd services suitable for monitoring
|
|
||||||
pub async fn discover_services() -> Vec<String> {
|
|
||||||
let mut services = Vec::new();
|
|
||||||
|
|
||||||
// Method 1: Try to find running services
|
|
||||||
if let Ok(running_services) = Self::discover_running_services().await {
|
|
||||||
services.extend(running_services);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Method 2: Add host-specific services based on hostname
|
|
||||||
let hostname = gethostname::gethostname().to_string_lossy().to_string();
|
|
||||||
services.extend(Self::get_host_specific_services(&hostname));
|
|
||||||
|
|
||||||
// Normalize aliases and verify the units actually exist before deduping
|
|
||||||
let canonicalized: Vec<String> = services
|
|
||||||
.into_iter()
|
|
||||||
.filter_map(|svc| Self::canonical_service_name(&svc))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let existing = Self::filter_existing_services(&canonicalized).await;
|
|
||||||
|
|
||||||
let mut unique_services: Vec<String> = existing
|
|
||||||
.into_iter()
|
|
||||||
.collect::<HashSet<_>>()
|
|
||||||
.into_iter()
|
|
||||||
.collect();
|
|
||||||
unique_services.sort();
|
|
||||||
|
|
||||||
debug!("Auto-detected services: {:?}", unique_services);
|
|
||||||
unique_services
|
|
||||||
}
|
|
||||||
|
|
||||||
/// List active systemd service units worth monitoring.
///
/// Runs `systemctl list-units --type=service --state=active` (NixOS
/// absolute path), strips the `.service` suffix, and keeps only names
/// passing `is_monitorable_service`.
///
/// # Errors
/// `CollectorError::CommandFailed` when systemctl cannot be spawned or
/// exits non-zero.
async fn discover_running_services() -> Result<Vec<String>, CollectorError> {
    let output = Command::new("/run/current-system/sw/bin/systemctl")
        .args([
            "list-units",
            "--type=service",
            "--state=active",
            "--no-pager",
            "--no-legend",
        ])
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .output()
        .await
        .map_err(|e| CollectorError::CommandFailed {
            command: "systemctl list-units".to_string(),
            message: e.to_string(),
        })?;

    if !output.status.success() {
        return Err(CollectorError::CommandFailed {
            command: "systemctl list-units".to_string(),
            message: String::from_utf8_lossy(&output.stderr).to_string(),
        });
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    let mut services = Vec::new();

    // With --no-legend each line starts with the unit name.
    for line in stdout.lines() {
        let parts: Vec<&str> = line.split_whitespace().collect();
        if !parts.is_empty() {
            let service_name = parts[0];
            // Remove .service suffix if present
            let clean_name = service_name
                .strip_suffix(".service")
                .unwrap_or(service_name);

            // Only include services we're interested in monitoring
            if Self::is_monitorable_service(clean_name) {
                services.push(clean_name.to_string());
            }
        }
    }

    Ok(services)
}
|
|
||||||
|
|
||||||
/// Decide whether a systemd unit name is interesting enough to monitor.
///
/// A name is rejected when it matches a known setup/one-shot unit, and
/// accepted when it overlaps one of the known service patterns — the
/// match is substring containment in *either* direction, so e.g. "git"
/// matches the "gitea" pattern too.
fn is_monitorable_service(service_name: &str) -> bool {
    // Setup/certificate services that don't need monitoring.
    const EXCLUDED: [&str; 4] = [
        "mosquitto-certs",
        "immich-setup",
        "phpfpm-kryddorten",
        "phpfpm-mariehall2",
    ];
    if EXCLUDED.iter().any(|excluded| service_name.contains(excluded)) {
        return false;
    }

    // Patterns for services we want to monitor.
    const INTERESTING: [&str; 26] = [
        // Web applications
        "gitea",
        "immich",
        "vaultwarden",
        "unifi",
        "wordpress",
        "nginx",
        "httpd",
        // Databases
        "postgresql",
        "mysql",
        "mariadb",
        "redis",
        "mongodb",
        "mongod",
        // Backup and storage
        "borg",
        "rclone",
        // Container runtimes
        "docker",
        // CI/CD services
        "gitea-actions",
        "gitea-runner",
        "actions-runner",
        // Network services
        "sshd",
        "dnsmasq",
        // MQTT and IoT services
        "mosquitto",
        "mqtt",
        // PHP-FPM services
        "phpfpm",
        // Home automation
        "haasp",
        // Backup services
        "backup",
    ];

    INTERESTING
        .iter()
        .any(|&pattern| service_name.contains(pattern) || pattern.contains(service_name))
}
|
|
||||||
|
|
||||||
/// Host-specific service additions.
///
/// Intentionally empty: discovery is fully automatic and nothing is
/// hardcoded per host. The hostname parameter is kept for interface
/// stability.
fn get_host_specific_services(_hostname: &str) -> Vec<String> {
    Vec::new()
}
|
|
||||||
|
|
||||||
fn canonical_service_name(service: &str) -> Option<String> {
|
|
||||||
let trimmed = service.trim();
|
|
||||||
if trimmed.is_empty() {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
let lower = trimmed.to_lowercase();
|
|
||||||
let aliases = [
|
|
||||||
("ssh", "sshd"),
|
|
||||||
("sshd", "sshd"),
|
|
||||||
("docker.service", "docker"),
|
|
||||||
];
|
|
||||||
|
|
||||||
for (alias, target) in aliases {
|
|
||||||
if lower == alias {
|
|
||||||
return Some(target.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Some(trimmed.to_string())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn filter_existing_services(services: &[String]) -> Vec<String> {
|
|
||||||
let mut existing = Vec::new();
|
|
||||||
|
|
||||||
for service in services {
|
|
||||||
if Self::service_exists(service).await {
|
|
||||||
existing.push(service.clone());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
existing
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn service_exists(service: &str) -> bool {
|
|
||||||
let unit = if service.ends_with(".service") {
|
|
||||||
service.to_string()
|
|
||||||
} else {
|
|
||||||
format!("{}.service", service)
|
|
||||||
};
|
|
||||||
|
|
||||||
match Command::new("/run/current-system/sw/bin/systemctl")
|
|
||||||
.args(["status", &unit])
|
|
||||||
.stdout(Stdio::null())
|
|
||||||
.stderr(Stdio::null())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
Ok(output) => output.status.success(),
|
|
||||||
Err(error) => {
|
|
||||||
warn!("Failed to check service {}: {}", unit, error);
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Auto-detect backup configuration
|
|
||||||
pub async fn discover_backup_config(hostname: &str) -> (bool, Option<String>, String) {
|
|
||||||
// Check if this host should have backup monitoring
|
|
||||||
let backup_enabled = hostname == "srv01" || Self::has_backup_service().await;
|
|
||||||
|
|
||||||
// Try to find restic repository
|
|
||||||
let restic_repo = if backup_enabled {
|
|
||||||
Self::discover_restic_repo().await
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
// Determine backup service name
|
|
||||||
let backup_service = Self::discover_backup_service()
|
|
||||||
.await
|
|
||||||
.unwrap_or_else(|| "restic-backup".to_string());
|
|
||||||
|
|
||||||
(backup_enabled, restic_repo, backup_service)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn has_backup_service() -> bool {
|
|
||||||
// Check for common backup services
|
|
||||||
let backup_services = ["restic", "borg", "duplicati", "rclone"];
|
|
||||||
|
|
||||||
for service in backup_services {
|
|
||||||
if let Ok(output) = Command::new("/run/current-system/sw/bin/systemctl")
|
|
||||||
.args(["is-enabled", service])
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
if output.status.success() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
false
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn discover_restic_repo() -> Option<String> {
|
|
||||||
// Common restic repository locations
|
|
||||||
let common_paths = [
|
|
||||||
"/srv/backups/restic",
|
|
||||||
"/var/backups/restic",
|
|
||||||
"/home/restic",
|
|
||||||
"/backup/restic",
|
|
||||||
"/mnt/backup/restic",
|
|
||||||
];
|
|
||||||
|
|
||||||
for path in common_paths {
|
|
||||||
if fs::metadata(path).await.is_ok() {
|
|
||||||
debug!("Found restic repository at: {}", path);
|
|
||||||
return Some(path.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try to find via environment variables or config files
|
|
||||||
if let Ok(content) = fs::read_to_string("/etc/restic/repository").await {
|
|
||||||
let repo_path = content.trim();
|
|
||||||
if !repo_path.is_empty() {
|
|
||||||
return Some(repo_path.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn discover_backup_service() -> Option<String> {
|
|
||||||
let backup_services = ["restic-backup", "restic", "borg-backup", "borg", "backup"];
|
|
||||||
|
|
||||||
for service in backup_services {
|
|
||||||
if let Ok(output) = Command::new("/run/current-system/sw/bin/systemctl")
|
|
||||||
.args(["is-enabled", &format!("{}.service", service)])
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
if output.status.success() {
|
|
||||||
return Some(service.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Validate auto-detected configuration
|
|
||||||
pub async fn validate_devices(devices: &[String]) -> Vec<String> {
|
|
||||||
let mut valid_devices = Vec::new();
|
|
||||||
|
|
||||||
for device in devices {
|
|
||||||
if Self::can_access_device(device).await {
|
|
||||||
valid_devices.push(device.clone());
|
|
||||||
} else {
|
|
||||||
warn!("Cannot access device {}, skipping", device);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
valid_devices
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn can_access_device(device: &str) -> bool {
|
|
||||||
let device_path = format!("/dev/{}", device);
|
|
||||||
|
|
||||||
// Try to run smartctl to see if device is accessible
|
|
||||||
if let Ok(output) = Command::new("sudo")
|
|
||||||
.args(["/run/current-system/sw/bin/smartctl", "-i", &device_path])
|
|
||||||
.stdout(Stdio::piped())
|
|
||||||
.stderr(Stdio::piped())
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
// smartctl returns 0 for success, but may return other codes for warnings
|
|
||||||
// that are still acceptable (like device supports SMART but has some issues)
|
|
||||||
output.status.code().map_or(false, |code| code <= 4)
|
|
||||||
} else {
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,66 +1,94 @@
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
use tokio::signal;
|
|
||||||
use tracing::{error, info};
|
use tracing::{error, info};
|
||||||
use tracing_subscriber::EnvFilter;
|
use tracing_subscriber::EnvFilter;
|
||||||
|
|
||||||
|
mod agent;
|
||||||
mod collectors;
|
mod collectors;
|
||||||
mod discovery;
|
mod communication;
|
||||||
|
mod config;
|
||||||
|
mod metrics;
|
||||||
mod notifications;
|
mod notifications;
|
||||||
mod simple_agent;
|
mod service_tracker;
|
||||||
|
mod status;
|
||||||
|
|
||||||
use simple_agent::SimpleAgent;
|
use agent::Agent;
|
||||||
|
|
||||||
|
/// Get version showing cm-dashboard-agent package hash for easy deployment verification
|
||||||
|
fn get_version() -> &'static str {
|
||||||
|
// Get the path of the current executable
|
||||||
|
let exe_path = std::env::current_exe().expect("Failed to get executable path");
|
||||||
|
let exe_str = exe_path.to_string_lossy();
|
||||||
|
|
||||||
|
// Extract Nix store hash from path like /nix/store/HASH-cm-dashboard-v0.1.8/bin/cm-dashboard-agent
|
||||||
|
let hash_part = exe_str.strip_prefix("/nix/store/").expect("Not a nix store path");
|
||||||
|
let hash = hash_part.split('-').next().expect("Invalid nix store path format");
|
||||||
|
assert!(hash.len() >= 8, "Hash too short");
|
||||||
|
|
||||||
|
// Return first 8 characters of nix store hash
|
||||||
|
let short_hash = hash[..8].to_string();
|
||||||
|
Box::leak(short_hash.into_boxed_str())
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Parser)]
|
#[derive(Parser)]
|
||||||
#[command(name = "cm-dashboard-agent")]
|
#[command(name = "cm-dashboard-agent")]
|
||||||
#[command(about = "CM Dashboard metrics agent with auto-detection")]
|
#[command(about = "CM Dashboard metrics agent with individual metric collection")]
|
||||||
#[command(version)]
|
#[command(version = get_version())]
|
||||||
struct Cli {
|
struct Cli {
|
||||||
/// Increase logging verbosity (-v, -vv)
|
/// Increase logging verbosity (-v, -vv)
|
||||||
#[arg(short, long, action = clap::ArgAction::Count)]
|
#[arg(short, long, action = clap::ArgAction::Count)]
|
||||||
verbose: u8,
|
verbose: u8,
|
||||||
|
|
||||||
|
/// Configuration file path (required)
|
||||||
|
#[arg(short, long)]
|
||||||
|
config: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<()> {
|
async fn main() -> Result<()> {
|
||||||
let cli = Cli::parse();
|
let cli = Cli::parse();
|
||||||
|
|
||||||
// Setup logging
|
// Setup logging
|
||||||
let log_level = match cli.verbose {
|
let log_level = match cli.verbose {
|
||||||
0 => "info",
|
0 => "info",
|
||||||
1 => "debug",
|
1 => "debug",
|
||||||
_ => "trace",
|
_ => "trace",
|
||||||
};
|
};
|
||||||
|
|
||||||
tracing_subscriber::fmt()
|
tracing_subscriber::fmt()
|
||||||
.with_env_filter(EnvFilter::from_default_env().add_directive(log_level.parse()?))
|
.with_env_filter(EnvFilter::from_default_env().add_directive(log_level.parse()?))
|
||||||
.init();
|
.init();
|
||||||
|
|
||||||
info!("CM Dashboard Agent starting...");
|
info!("CM Dashboard Agent starting with individual metrics architecture...");
|
||||||
|
|
||||||
// Create and run agent
|
// Create and run agent
|
||||||
let mut agent = SimpleAgent::new().await?;
|
let mut agent = Agent::new(Some(cli.config)).await?;
|
||||||
|
|
||||||
// Setup graceful shutdown
|
// Setup graceful shutdown channel
|
||||||
|
let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel();
|
||||||
|
|
||||||
let ctrl_c = async {
|
let ctrl_c = async {
|
||||||
signal::ctrl_c()
|
tokio::signal::ctrl_c()
|
||||||
.await
|
.await
|
||||||
.expect("failed to install Ctrl+C handler");
|
.expect("failed to install Ctrl+C handler");
|
||||||
};
|
};
|
||||||
|
|
||||||
// Run agent with graceful shutdown
|
// Run agent with graceful shutdown
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
result = agent.run() => {
|
result = agent.run(shutdown_rx) => {
|
||||||
if let Err(e) = result {
|
if let Err(e) = result {
|
||||||
error!("Agent error: {}", e);
|
error!("Agent error: {}", e);
|
||||||
return Err(e);
|
return Err(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_ = ctrl_c => {
|
_ = ctrl_c => {
|
||||||
info!("Shutdown signal received");
|
info!("Shutdown signal received, stopping agent...");
|
||||||
|
let _ = shutdown_tx.send(());
|
||||||
|
// Give agent time to shutdown gracefully
|
||||||
|
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("Agent shutdown complete");
|
info!("Agent shutdown complete");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
264
agent/src/metrics/mod.rs
Normal file
264
agent/src/metrics/mod.rs
Normal file
@@ -0,0 +1,264 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use cm_dashboard_shared::{Metric, StatusTracker};
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
use tracing::{debug, error, info};
|
||||||
|
|
||||||
|
use crate::collectors::{
|
||||||
|
backup::BackupCollector, cpu::CpuCollector, disk::DiskCollector, memory::MemoryCollector,
|
||||||
|
nixos::NixOSCollector, systemd::SystemdCollector, Collector,
|
||||||
|
};
|
||||||
|
use crate::config::{AgentConfig, CollectorConfig};
|
||||||
|
|
||||||
|
/// Collector with timing information
|
||||||
|
struct TimedCollector {
|
||||||
|
collector: Box<dyn Collector>,
|
||||||
|
interval: Duration,
|
||||||
|
last_collection: Option<Instant>,
|
||||||
|
name: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Manages all metric collectors with individual intervals
|
||||||
|
pub struct MetricCollectionManager {
|
||||||
|
collectors: Vec<TimedCollector>,
|
||||||
|
status_tracker: StatusTracker,
|
||||||
|
cached_metrics: Vec<Metric>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MetricCollectionManager {
|
||||||
|
pub async fn new(config: &CollectorConfig, _agent_config: &AgentConfig) -> Result<Self> {
|
||||||
|
let mut collectors: Vec<TimedCollector> = Vec::new();
|
||||||
|
|
||||||
|
// Benchmark mode - only enable specific collector based on env var
|
||||||
|
let benchmark_mode = std::env::var("BENCHMARK_COLLECTOR").ok();
|
||||||
|
|
||||||
|
match benchmark_mode.as_deref() {
|
||||||
|
Some("cpu") => {
|
||||||
|
// CPU collector only
|
||||||
|
if config.cpu.enabled {
|
||||||
|
let cpu_collector = CpuCollector::new(config.cpu.clone());
|
||||||
|
collectors.push(TimedCollector {
|
||||||
|
collector: Box::new(cpu_collector),
|
||||||
|
interval: Duration::from_secs(config.cpu.interval_seconds),
|
||||||
|
last_collection: None,
|
||||||
|
name: "CPU".to_string(),
|
||||||
|
});
|
||||||
|
info!("BENCHMARK: CPU collector only");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some("memory") => {
|
||||||
|
// Memory collector only
|
||||||
|
if config.memory.enabled {
|
||||||
|
let memory_collector = MemoryCollector::new(config.memory.clone());
|
||||||
|
collectors.push(TimedCollector {
|
||||||
|
collector: Box::new(memory_collector),
|
||||||
|
interval: Duration::from_secs(config.memory.interval_seconds),
|
||||||
|
last_collection: None,
|
||||||
|
name: "Memory".to_string(),
|
||||||
|
});
|
||||||
|
info!("BENCHMARK: Memory collector only");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some("disk") => {
|
||||||
|
// Disk collector only
|
||||||
|
let disk_collector = DiskCollector::new(config.disk.clone());
|
||||||
|
collectors.push(TimedCollector {
|
||||||
|
collector: Box::new(disk_collector),
|
||||||
|
interval: Duration::from_secs(config.disk.interval_seconds),
|
||||||
|
last_collection: None,
|
||||||
|
name: "Disk".to_string(),
|
||||||
|
});
|
||||||
|
info!("BENCHMARK: Disk collector only");
|
||||||
|
}
|
||||||
|
Some("systemd") => {
|
||||||
|
// Systemd collector only
|
||||||
|
let systemd_collector = SystemdCollector::new(config.systemd.clone());
|
||||||
|
collectors.push(TimedCollector {
|
||||||
|
collector: Box::new(systemd_collector),
|
||||||
|
interval: Duration::from_secs(config.systemd.interval_seconds),
|
||||||
|
last_collection: None,
|
||||||
|
name: "Systemd".to_string(),
|
||||||
|
});
|
||||||
|
info!("BENCHMARK: Systemd collector only");
|
||||||
|
}
|
||||||
|
Some("backup") => {
|
||||||
|
// Backup collector only
|
||||||
|
if config.backup.enabled {
|
||||||
|
let backup_collector = BackupCollector::new(
|
||||||
|
config.backup.backup_paths.first().cloned(),
|
||||||
|
config.backup.max_age_hours,
|
||||||
|
);
|
||||||
|
collectors.push(TimedCollector {
|
||||||
|
collector: Box::new(backup_collector),
|
||||||
|
interval: Duration::from_secs(config.backup.interval_seconds),
|
||||||
|
last_collection: None,
|
||||||
|
name: "Backup".to_string(),
|
||||||
|
});
|
||||||
|
info!("BENCHMARK: Backup collector only");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some("none") => {
|
||||||
|
// No collectors - test agent loop only
|
||||||
|
info!("BENCHMARK: No collectors enabled");
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
// Normal mode - all collectors
|
||||||
|
if config.cpu.enabled {
|
||||||
|
let cpu_collector = CpuCollector::new(config.cpu.clone());
|
||||||
|
collectors.push(TimedCollector {
|
||||||
|
collector: Box::new(cpu_collector),
|
||||||
|
interval: Duration::from_secs(config.cpu.interval_seconds),
|
||||||
|
last_collection: None,
|
||||||
|
name: "CPU".to_string(),
|
||||||
|
});
|
||||||
|
info!("CPU collector initialized with {}s interval", config.cpu.interval_seconds);
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.memory.enabled {
|
||||||
|
let memory_collector = MemoryCollector::new(config.memory.clone());
|
||||||
|
collectors.push(TimedCollector {
|
||||||
|
collector: Box::new(memory_collector),
|
||||||
|
interval: Duration::from_secs(config.memory.interval_seconds),
|
||||||
|
last_collection: None,
|
||||||
|
name: "Memory".to_string(),
|
||||||
|
});
|
||||||
|
info!("Memory collector initialized with {}s interval", config.memory.interval_seconds);
|
||||||
|
}
|
||||||
|
|
||||||
|
let disk_collector = DiskCollector::new(config.disk.clone());
|
||||||
|
collectors.push(TimedCollector {
|
||||||
|
collector: Box::new(disk_collector),
|
||||||
|
interval: Duration::from_secs(config.disk.interval_seconds),
|
||||||
|
last_collection: None,
|
||||||
|
name: "Disk".to_string(),
|
||||||
|
});
|
||||||
|
info!("Disk collector initialized with {}s interval", config.disk.interval_seconds);
|
||||||
|
|
||||||
|
let systemd_collector = SystemdCollector::new(config.systemd.clone());
|
||||||
|
collectors.push(TimedCollector {
|
||||||
|
collector: Box::new(systemd_collector),
|
||||||
|
interval: Duration::from_secs(config.systemd.interval_seconds),
|
||||||
|
last_collection: None,
|
||||||
|
name: "Systemd".to_string(),
|
||||||
|
});
|
||||||
|
info!("Systemd collector initialized with {}s interval", config.systemd.interval_seconds);
|
||||||
|
|
||||||
|
if config.backup.enabled {
|
||||||
|
let backup_collector = BackupCollector::new(
|
||||||
|
config.backup.backup_paths.first().cloned(),
|
||||||
|
config.backup.max_age_hours,
|
||||||
|
);
|
||||||
|
collectors.push(TimedCollector {
|
||||||
|
collector: Box::new(backup_collector),
|
||||||
|
interval: Duration::from_secs(config.backup.interval_seconds),
|
||||||
|
last_collection: None,
|
||||||
|
name: "Backup".to_string(),
|
||||||
|
});
|
||||||
|
info!("Backup collector initialized with {}s interval", config.backup.interval_seconds);
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.nixos.enabled {
|
||||||
|
let nixos_collector = NixOSCollector::new(config.nixos.clone());
|
||||||
|
collectors.push(TimedCollector {
|
||||||
|
collector: Box::new(nixos_collector),
|
||||||
|
interval: Duration::from_secs(config.nixos.interval_seconds),
|
||||||
|
last_collection: None,
|
||||||
|
name: "NixOS".to_string(),
|
||||||
|
});
|
||||||
|
info!("NixOS collector initialized with {}s interval", config.nixos.interval_seconds);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"Metric collection manager initialized with {} collectors",
|
||||||
|
collectors.len()
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
collectors,
|
||||||
|
status_tracker: StatusTracker::new(),
|
||||||
|
cached_metrics: Vec::new(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Force collection from ALL collectors immediately (used at startup)
|
||||||
|
pub async fn collect_all_metrics_force(&mut self) -> Result<Vec<Metric>> {
|
||||||
|
let mut all_metrics = Vec::new();
|
||||||
|
let now = Instant::now();
|
||||||
|
|
||||||
|
for timed_collector in &mut self.collectors {
|
||||||
|
match timed_collector.collector.collect(&mut self.status_tracker).await {
|
||||||
|
Ok(metrics) => {
|
||||||
|
let metric_count = metrics.len();
|
||||||
|
all_metrics.extend(metrics);
|
||||||
|
timed_collector.last_collection = Some(now);
|
||||||
|
debug!("Force collected {} metrics from {}", metric_count, timed_collector.name);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("Collector {} failed: {}", timed_collector.name, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cache the collected metrics
|
||||||
|
self.cached_metrics = all_metrics.clone();
|
||||||
|
Ok(all_metrics)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Collect metrics from collectors whose intervals have elapsed
|
||||||
|
pub async fn collect_metrics_timed(&mut self) -> Result<Vec<Metric>> {
|
||||||
|
let mut all_metrics = Vec::new();
|
||||||
|
let now = Instant::now();
|
||||||
|
|
||||||
|
for timed_collector in &mut self.collectors {
|
||||||
|
let should_collect = match timed_collector.last_collection {
|
||||||
|
None => true, // First collection
|
||||||
|
Some(last_time) => now.duration_since(last_time) >= timed_collector.interval,
|
||||||
|
};
|
||||||
|
|
||||||
|
if should_collect {
|
||||||
|
match timed_collector.collector.collect(&mut self.status_tracker).await {
|
||||||
|
Ok(metrics) => {
|
||||||
|
let metric_count = metrics.len();
|
||||||
|
all_metrics.extend(metrics);
|
||||||
|
timed_collector.last_collection = Some(now);
|
||||||
|
debug!(
|
||||||
|
"Collected {} metrics from {} ({}s interval)",
|
||||||
|
metric_count,
|
||||||
|
timed_collector.name,
|
||||||
|
timed_collector.interval.as_secs()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("Collector {} failed: {}", timed_collector.name, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update cache with newly collected metrics
|
||||||
|
if !all_metrics.is_empty() {
|
||||||
|
// Merge new metrics with cached metrics (replace by name)
|
||||||
|
for new_metric in &all_metrics {
|
||||||
|
// Remove any existing metric with the same name
|
||||||
|
self.cached_metrics.retain(|cached| cached.name != new_metric.name);
|
||||||
|
// Add the new metric
|
||||||
|
self.cached_metrics.push(new_metric.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(all_metrics)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Collect metrics from all collectors (legacy method for compatibility)
|
||||||
|
pub async fn collect_all_metrics(&mut self) -> Result<Vec<Metric>> {
|
||||||
|
self.collect_metrics_timed().await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get cached metrics without triggering fresh collection
|
||||||
|
pub fn get_cached_metrics(&self) -> Vec<Metric> {
|
||||||
|
self.cached_metrics.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@@ -1,245 +0,0 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
use std::path::Path;
|
|
||||||
use chrono::{DateTime, Utc};
|
|
||||||
use chrono_tz::Europe::Stockholm;
|
|
||||||
use lettre::{Message, SmtpTransport, Transport};
|
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
use tracing::{info, error, warn};
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
||||||
pub struct NotificationConfig {
|
|
||||||
pub enabled: bool,
|
|
||||||
pub smtp_host: String,
|
|
||||||
pub smtp_port: u16,
|
|
||||||
pub from_email: String,
|
|
||||||
pub to_email: String,
|
|
||||||
pub rate_limit_minutes: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for NotificationConfig {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self {
|
|
||||||
enabled: false,
|
|
||||||
smtp_host: "localhost".to_string(),
|
|
||||||
smtp_port: 25,
|
|
||||||
from_email: "".to_string(),
|
|
||||||
to_email: "".to_string(),
|
|
||||||
rate_limit_minutes: 30, // Don't spam notifications
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq)]
|
|
||||||
pub struct StatusChange {
|
|
||||||
pub component: String,
|
|
||||||
pub metric: String,
|
|
||||||
pub old_status: String,
|
|
||||||
pub new_status: String,
|
|
||||||
pub timestamp: DateTime<Utc>,
|
|
||||||
pub details: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct NotificationManager {
|
|
||||||
config: NotificationConfig,
|
|
||||||
last_status: HashMap<String, String>, // key: "component.metric", value: status
|
|
||||||
last_details: HashMap<String, String>, // key: "component.metric", value: details from warning/critical
|
|
||||||
last_notification: HashMap<String, DateTime<Utc>>, // Rate limiting
|
|
||||||
}
|
|
||||||
|
|
||||||
impl NotificationManager {
|
|
||||||
pub fn new(config: NotificationConfig) -> Self {
|
|
||||||
Self {
|
|
||||||
config,
|
|
||||||
last_status: HashMap::new(),
|
|
||||||
last_details: HashMap::new(),
|
|
||||||
last_notification: HashMap::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn update_status(&mut self, component: &str, metric: &str, status: &str) -> Option<StatusChange> {
|
|
||||||
self.update_status_with_details(component, metric, status, None)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn update_status_with_details(&mut self, component: &str, metric: &str, status: &str, details: Option<String>) -> Option<StatusChange> {
|
|
||||||
let key = format!("{}.{}", component, metric);
|
|
||||||
let old_status = self.last_status.get(&key).cloned();
|
|
||||||
|
|
||||||
if let Some(old) = &old_status {
|
|
||||||
if old != status {
|
|
||||||
// For recovery notifications, include original problem details
|
|
||||||
let change_details = if status == "ok" && (old == "warning" || old == "critical") {
|
|
||||||
// Recovery: combine current status details with what we recovered from
|
|
||||||
let old_details = self.last_details.get(&key).cloned();
|
|
||||||
match (old_details, &details) {
|
|
||||||
(Some(old_detail), Some(current_detail)) => Some(format!("Recovered from: {}\nCurrent status: {}", old_detail, current_detail)),
|
|
||||||
(Some(old_detail), None) => Some(format!("Recovered from: {}", old_detail)),
|
|
||||||
(None, current) => current.clone(),
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
details.clone()
|
|
||||||
};
|
|
||||||
|
|
||||||
let change = StatusChange {
|
|
||||||
component: component.to_string(),
|
|
||||||
metric: metric.to_string(),
|
|
||||||
old_status: old.clone(),
|
|
||||||
new_status: status.to_string(),
|
|
||||||
timestamp: Utc::now(),
|
|
||||||
details: change_details,
|
|
||||||
};
|
|
||||||
|
|
||||||
self.last_status.insert(key.clone(), status.to_string());
|
|
||||||
|
|
||||||
// Store details for warning/critical states (for future recovery notifications)
|
|
||||||
if status == "warning" || status == "critical" {
|
|
||||||
if let Some(ref detail) = details {
|
|
||||||
self.last_details.insert(key.clone(), detail.clone());
|
|
||||||
}
|
|
||||||
} else if status == "ok" {
|
|
||||||
// Clear stored details after recovery
|
|
||||||
self.last_details.remove(&key);
|
|
||||||
}
|
|
||||||
|
|
||||||
if self.should_notify(&change) {
|
|
||||||
return Some(change);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// First time seeing this metric - store but don't notify
|
|
||||||
self.last_status.insert(key.clone(), status.to_string());
|
|
||||||
if (status == "warning" || status == "critical") && details.is_some() {
|
|
||||||
self.last_details.insert(key, details.unwrap());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
fn should_notify(&mut self, change: &StatusChange) -> bool {
|
|
||||||
if !self.config.enabled {
|
|
||||||
info!("Notifications disabled, skipping {}.{}", change.component, change.metric);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Only notify on transitions to warning/critical, or recovery to ok
|
|
||||||
let should_send = match (change.old_status.as_str(), change.new_status.as_str()) {
|
|
||||||
(_, "warning") | (_, "critical") => true,
|
|
||||||
("warning" | "critical", "ok") => true,
|
|
||||||
_ => false,
|
|
||||||
};
|
|
||||||
|
|
||||||
info!("Status change {}.{}: {} -> {} (notify: {})",
|
|
||||||
change.component, change.metric, change.old_status, change.new_status, should_send);
|
|
||||||
|
|
||||||
should_send
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_rate_limited(&mut self, change: &StatusChange) -> bool {
|
|
||||||
let key = format!("{}.{}", change.component, change.metric);
|
|
||||||
|
|
||||||
if let Some(last_time) = self.last_notification.get(&key) {
|
|
||||||
let minutes_since = Utc::now().signed_duration_since(*last_time).num_minutes();
|
|
||||||
if minutes_since < self.config.rate_limit_minutes as i64 {
|
|
||||||
info!("Rate limiting {}.{}: {} minutes since last notification (limit: {})",
|
|
||||||
change.component, change.metric, minutes_since, self.config.rate_limit_minutes);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
self.last_notification.insert(key.clone(), Utc::now());
|
|
||||||
info!("Not rate limited {}.{}, sending notification", change.component, change.metric);
|
|
||||||
false
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_maintenance_mode() -> bool {
|
|
||||||
Path::new("/tmp/cm-maintenance").exists()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn send_notification(&mut self, change: StatusChange) {
|
|
||||||
if !self.config.enabled {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if Self::is_maintenance_mode() {
|
|
||||||
info!("Suppressing notification for {}.{} (maintenance mode active)", change.component, change.metric);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if self.is_rate_limited(&change) {
|
|
||||||
warn!("Rate limiting notification for {}.{}", change.component, change.metric);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
let subject = self.format_subject(&change);
|
|
||||||
let body = self.format_body(&change);
|
|
||||||
|
|
||||||
if let Err(e) = self.send_email(&subject, &body).await {
|
|
||||||
error!("Failed to send notification email: {}", e);
|
|
||||||
} else {
|
|
||||||
info!("Sent notification: {} {}.{} {} → {}",
|
|
||||||
change.component, change.component, change.metric,
|
|
||||||
change.old_status, change.new_status);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn format_subject(&self, change: &StatusChange) -> String {
|
|
||||||
let urgency = match change.new_status.as_str() {
|
|
||||||
"critical" => "🔴 CRITICAL",
|
|
||||||
"warning" => "🟡 WARNING",
|
|
||||||
"ok" => "✅ RESOLVED",
|
|
||||||
_ => "ℹ️ STATUS",
|
|
||||||
};
|
|
||||||
|
|
||||||
format!("{}: {} {} on {}",
|
|
||||||
urgency,
|
|
||||||
change.component,
|
|
||||||
change.metric,
|
|
||||||
gethostname::gethostname().to_string_lossy())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn format_body(&self, change: &StatusChange) -> String {
|
|
||||||
let mut body = format!(
|
|
||||||
"Status Change Alert\n\
|
|
||||||
\n\
|
|
||||||
Host: {}\n\
|
|
||||||
Component: {}\n\
|
|
||||||
Metric: {}\n\
|
|
||||||
Status Change: {} → {}\n\
|
|
||||||
Time: {}",
|
|
||||||
gethostname::gethostname().to_string_lossy(),
|
|
||||||
change.component,
|
|
||||||
change.metric,
|
|
||||||
change.old_status,
|
|
||||||
change.new_status,
|
|
||||||
change.timestamp.with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST")
|
|
||||||
);
|
|
||||||
|
|
||||||
if let Some(details) = &change.details {
|
|
||||||
body.push_str(&format!("\n\nDetails:\n{}", details));
|
|
||||||
}
|
|
||||||
|
|
||||||
body.push_str(&format!(
|
|
||||||
"\n\n--\n\
|
|
||||||
CM Dashboard Agent\n\
|
|
||||||
Generated at {}",
|
|
||||||
Utc::now().with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST")
|
|
||||||
));
|
|
||||||
|
|
||||||
body
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn send_email(&self, subject: &str, body: &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|
||||||
let email = Message::builder()
|
|
||||||
.from(self.config.from_email.parse()?)
|
|
||||||
.to(self.config.to_email.parse()?)
|
|
||||||
.subject(subject)
|
|
||||||
.body(body.to_string())?;
|
|
||||||
|
|
||||||
let mailer = SmtpTransport::builder_dangerous(&self.config.smtp_host)
|
|
||||||
.port(self.config.smtp_port)
|
|
||||||
.build();
|
|
||||||
|
|
||||||
mailer.send(&email)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
64
agent/src/notifications/mod.rs
Normal file
64
agent/src/notifications/mod.rs
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
use crate::config::NotificationConfig;
|
||||||
|
use anyhow::Result;
|
||||||
|
use chrono::Utc;
|
||||||
|
use lettre::transport::smtp::SmtpTransport;
|
||||||
|
use lettre::{Message, Transport};
|
||||||
|
use tracing::{debug, error, info};
|
||||||
|
|
||||||
|
/// Manages notifications
|
||||||
|
pub struct NotificationManager {
|
||||||
|
config: NotificationConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl NotificationManager {
|
||||||
|
pub fn new(config: &NotificationConfig, _hostname: &str) -> Result<Self> {
|
||||||
|
Ok(Self {
|
||||||
|
config: config.clone(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn send_direct_email(&mut self, subject: &str, body: &str) -> Result<()> {
|
||||||
|
if !self.config.enabled {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.is_maintenance_mode() {
|
||||||
|
debug!("Maintenance mode active, suppressing email notification");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
let hostname = gethostname::gethostname()
|
||||||
|
.to_string_lossy()
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
let from_email = self.config.from_email.replace("{hostname}", &hostname);
|
||||||
|
|
||||||
|
let email_body = format!(
|
||||||
|
"{}\n\n--\nCM Dashboard Agent\nGenerated at {}",
|
||||||
|
body,
|
||||||
|
Utc::now().format("%Y-%m-%d %H:%M:%S %Z")
|
||||||
|
);
|
||||||
|
|
||||||
|
let email = Message::builder()
|
||||||
|
.from(from_email.parse()?)
|
||||||
|
.to(self.config.to_email.parse()?)
|
||||||
|
.subject(subject)
|
||||||
|
.body(email_body)?;
|
||||||
|
|
||||||
|
let mailer = SmtpTransport::unencrypted_localhost();
|
||||||
|
|
||||||
|
match mailer.send(&email) {
|
||||||
|
Ok(_) => info!("Direct email sent successfully: {}", subject),
|
||||||
|
Err(e) => {
|
||||||
|
error!("Failed to send email: {}", e);
|
||||||
|
return Err(e.into());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_maintenance_mode(&self) -> bool {
|
||||||
|
std::fs::metadata("/tmp/cm-maintenance").is_ok()
|
||||||
|
}
|
||||||
|
}
|
||||||
172
agent/src/service_tracker.rs
Normal file
172
agent/src/service_tracker.rs
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use std::fs;
|
||||||
|
use std::path::Path;
|
||||||
|
use std::sync::{Arc, Mutex, OnceLock};
|
||||||
|
use tracing::{debug, info, warn};
|
||||||
|
|
||||||
|
/// Shared instance for global access
|
||||||
|
static GLOBAL_TRACKER: OnceLock<Arc<Mutex<UserStoppedServiceTracker>>> = OnceLock::new();
|
||||||
|
|
||||||
|
/// Tracks services that have been stopped by user action
|
||||||
|
/// These services should be treated as OK status instead of Warning
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct UserStoppedServiceTracker {
|
||||||
|
/// Set of services stopped by user action
|
||||||
|
user_stopped_services: HashSet<String>,
|
||||||
|
/// Path to persistent storage file
|
||||||
|
storage_path: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Serializable data structure for persistence
|
||||||
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
struct UserStoppedData {
|
||||||
|
services: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl UserStoppedServiceTracker {
|
||||||
|
/// Create new tracker with default storage path
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self::with_storage_path("/var/lib/cm-dashboard/user-stopped-services.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Initialize global instance (called by agent)
|
||||||
|
pub fn init_global() -> Result<Self> {
|
||||||
|
let tracker = Self::new();
|
||||||
|
|
||||||
|
// Set global instance
|
||||||
|
let global_instance = Arc::new(Mutex::new(tracker));
|
||||||
|
if GLOBAL_TRACKER.set(global_instance).is_err() {
|
||||||
|
warn!("Global service tracker was already initialized");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return a new instance for the agent to use
|
||||||
|
Ok(Self::new())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if a service is user-stopped (global access for collectors)
|
||||||
|
pub fn is_service_user_stopped(service_name: &str) -> bool {
|
||||||
|
if let Some(global) = GLOBAL_TRACKER.get() {
|
||||||
|
if let Ok(tracker) = global.lock() {
|
||||||
|
tracker.is_user_stopped(service_name)
|
||||||
|
} else {
|
||||||
|
debug!("Failed to lock global service tracker");
|
||||||
|
false
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
debug!("Global service tracker not initialized");
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Update global tracker (called by agent when tracker state changes)
|
||||||
|
pub fn update_global(updated_tracker: &UserStoppedServiceTracker) {
|
||||||
|
if let Some(global) = GLOBAL_TRACKER.get() {
|
||||||
|
if let Ok(mut tracker) = global.lock() {
|
||||||
|
tracker.user_stopped_services = updated_tracker.user_stopped_services.clone();
|
||||||
|
} else {
|
||||||
|
debug!("Failed to lock global service tracker for update");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
debug!("Global service tracker not initialized for update");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create new tracker with custom storage path
|
||||||
|
pub fn with_storage_path<P: AsRef<Path>>(storage_path: P) -> Self {
|
||||||
|
let storage_path = storage_path.as_ref().to_string_lossy().to_string();
|
||||||
|
let mut tracker = Self {
|
||||||
|
user_stopped_services: HashSet::new(),
|
||||||
|
storage_path,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Load existing data from storage
|
||||||
|
if let Err(e) = tracker.load_from_storage() {
|
||||||
|
warn!("Failed to load user-stopped services from storage: {}", e);
|
||||||
|
info!("Starting with empty user-stopped services list");
|
||||||
|
}
|
||||||
|
|
||||||
|
tracker
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mark a service as user-stopped
|
||||||
|
pub fn mark_user_stopped(&mut self, service_name: &str) -> Result<()> {
|
||||||
|
info!("Marking service '{}' as user-stopped", service_name);
|
||||||
|
self.user_stopped_services.insert(service_name.to_string());
|
||||||
|
self.save_to_storage()?;
|
||||||
|
debug!("Service '{}' marked as user-stopped and saved to storage", service_name);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clear user-stopped flag for a service (when user starts it)
|
||||||
|
pub fn clear_user_stopped(&mut self, service_name: &str) -> Result<()> {
|
||||||
|
if self.user_stopped_services.remove(service_name) {
|
||||||
|
info!("Cleared user-stopped flag for service '{}'", service_name);
|
||||||
|
self.save_to_storage()?;
|
||||||
|
debug!("Service '{}' user-stopped flag cleared and saved to storage", service_name);
|
||||||
|
} else {
|
||||||
|
debug!("Service '{}' was not marked as user-stopped", service_name);
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if a service is marked as user-stopped
|
||||||
|
pub fn is_user_stopped(&self, service_name: &str) -> bool {
|
||||||
|
let is_stopped = self.user_stopped_services.contains(service_name);
|
||||||
|
debug!("Service '{}' user-stopped status: {}", service_name, is_stopped);
|
||||||
|
is_stopped
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Save current state to persistent storage
|
||||||
|
fn save_to_storage(&self) -> Result<()> {
|
||||||
|
// Create parent directory if it doesn't exist
|
||||||
|
if let Some(parent_dir) = Path::new(&self.storage_path).parent() {
|
||||||
|
if !parent_dir.exists() {
|
||||||
|
fs::create_dir_all(parent_dir)?;
|
||||||
|
debug!("Created parent directory: {}", parent_dir.display());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let data = UserStoppedData {
|
||||||
|
services: self.user_stopped_services.iter().cloned().collect(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let json_data = serde_json::to_string_pretty(&data)?;
|
||||||
|
fs::write(&self.storage_path, json_data)?;
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
"Saved {} user-stopped services to {}",
|
||||||
|
data.services.len(),
|
||||||
|
self.storage_path
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load state from persistent storage
|
||||||
|
fn load_from_storage(&mut self) -> Result<()> {
|
||||||
|
if !Path::new(&self.storage_path).exists() {
|
||||||
|
debug!("Storage file {} does not exist, starting fresh", self.storage_path);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
let json_data = fs::read_to_string(&self.storage_path)?;
|
||||||
|
let data: UserStoppedData = serde_json::from_str(&json_data)?;
|
||||||
|
|
||||||
|
self.user_stopped_services = data.services.into_iter().collect();
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"Loaded {} user-stopped services from {}",
|
||||||
|
self.user_stopped_services.len(),
|
||||||
|
self.storage_path
|
||||||
|
);
|
||||||
|
|
||||||
|
if !self.user_stopped_services.is_empty() {
|
||||||
|
debug!("User-stopped services: {:?}", self.user_stopped_services);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@@ -1,210 +0,0 @@
|
|||||||
use std::time::Duration;
|
|
||||||
use chrono::Utc;
|
|
||||||
use gethostname::gethostname;
|
|
||||||
use tokio::time::interval;
|
|
||||||
use tracing::{info, error, warn};
|
|
||||||
use zmq::{Context, Socket, SocketType};
|
|
||||||
|
|
||||||
use crate::collectors::{
|
|
||||||
backup::BackupCollector,
|
|
||||||
service::ServiceCollector,
|
|
||||||
smart::SmartCollector,
|
|
||||||
system::SystemCollector,
|
|
||||||
Collector
|
|
||||||
};
|
|
||||||
use cm_dashboard_shared::envelope::AgentType;
|
|
||||||
use crate::discovery::AutoDiscovery;
|
|
||||||
use crate::notifications::{NotificationManager, NotificationConfig};
|
|
||||||
|
|
||||||
pub struct SimpleAgent {
|
|
||||||
hostname: String,
|
|
||||||
zmq_socket: Socket,
|
|
||||||
notification_manager: NotificationManager,
|
|
||||||
collectors: Vec<Box<dyn Collector + Send + Sync>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SimpleAgent {
|
|
||||||
pub async fn new() -> anyhow::Result<Self> {
|
|
||||||
let hostname = gethostname().to_string_lossy().to_string();
|
|
||||||
|
|
||||||
info!("Starting CM Dashboard Agent on {}", hostname);
|
|
||||||
|
|
||||||
// Setup ZMQ
|
|
||||||
let context = Context::new();
|
|
||||||
let socket = context.socket(SocketType::PUB)?;
|
|
||||||
socket.bind("tcp://0.0.0.0:6130")?;
|
|
||||||
info!("ZMQ publisher bound to tcp://0.0.0.0:6130");
|
|
||||||
|
|
||||||
// Setup notifications
|
|
||||||
let notification_config = NotificationConfig {
|
|
||||||
enabled: true,
|
|
||||||
smtp_host: "localhost".to_string(),
|
|
||||||
smtp_port: 25,
|
|
||||||
from_email: format!("{}@cmtec.se", hostname),
|
|
||||||
to_email: "cm@cmtec.se".to_string(),
|
|
||||||
rate_limit_minutes: 0, // Disabled for testing
|
|
||||||
};
|
|
||||||
let notification_manager = NotificationManager::new(notification_config.clone());
|
|
||||||
info!("Notifications: {} -> {}", notification_config.from_email, notification_config.to_email);
|
|
||||||
|
|
||||||
// Auto-discover and create collectors
|
|
||||||
let mut collectors: Vec<Box<dyn Collector + Send + Sync>> = Vec::new();
|
|
||||||
|
|
||||||
// SMART collector
|
|
||||||
let devices = AutoDiscovery::discover_storage_devices().await;
|
|
||||||
let valid_devices = AutoDiscovery::validate_devices(&devices).await;
|
|
||||||
if !valid_devices.is_empty() {
|
|
||||||
let smart_collector = SmartCollector::new(true, 5000, valid_devices.clone());
|
|
||||||
collectors.push(Box::new(smart_collector));
|
|
||||||
info!("SMART monitoring: {:?}", valid_devices);
|
|
||||||
} else {
|
|
||||||
warn!("No storage devices found - SMART monitoring disabled");
|
|
||||||
}
|
|
||||||
|
|
||||||
// System collector
|
|
||||||
let system_collector = SystemCollector::new(true, 5000);
|
|
||||||
collectors.push(Box::new(system_collector));
|
|
||||||
info!("System monitoring: CPU, memory, temperature, C-states");
|
|
||||||
|
|
||||||
// Service collector
|
|
||||||
let services = AutoDiscovery::discover_services().await;
|
|
||||||
let service_list = if !services.is_empty() {
|
|
||||||
services
|
|
||||||
} else {
|
|
||||||
vec!["ssh".to_string()] // Fallback to SSH only
|
|
||||||
};
|
|
||||||
let service_collector = ServiceCollector::new(true, 5000, service_list.clone());
|
|
||||||
collectors.push(Box::new(service_collector));
|
|
||||||
info!("Service monitoring: {:?}", service_list);
|
|
||||||
|
|
||||||
// Backup collector
|
|
||||||
let (backup_enabled, restic_repo, backup_service) =
|
|
||||||
AutoDiscovery::discover_backup_config(&hostname).await;
|
|
||||||
if backup_enabled {
|
|
||||||
let backup_collector = BackupCollector::new(true, 30000, restic_repo.clone(), backup_service.clone());
|
|
||||||
collectors.push(Box::new(backup_collector));
|
|
||||||
info!("Backup monitoring: repo={:?}, service={}", restic_repo, backup_service);
|
|
||||||
} else {
|
|
||||||
info!("Backup monitoring disabled (no backup system detected)");
|
|
||||||
}
|
|
||||||
|
|
||||||
info!("Agent initialized with {} collectors", collectors.len());
|
|
||||||
|
|
||||||
Ok(Self {
|
|
||||||
hostname,
|
|
||||||
zmq_socket: socket,
|
|
||||||
notification_manager,
|
|
||||||
collectors,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn run(&mut self) -> anyhow::Result<()> {
|
|
||||||
info!("Starting metrics collection...");
|
|
||||||
|
|
||||||
// Create collection tasks for each collector (unused for now)
|
|
||||||
let mut _tasks: Vec<tokio::task::JoinHandle<()>> = Vec::new();
|
|
||||||
|
|
||||||
for collector in &self.collectors {
|
|
||||||
let collector_name = collector.name().to_string();
|
|
||||||
let _agent_type = collector.agent_type();
|
|
||||||
let interval_duration = collector.collect_interval();
|
|
||||||
|
|
||||||
info!("{} collector: {}ms interval", collector_name, interval_duration.as_millis());
|
|
||||||
|
|
||||||
// Clone what we need for the task
|
|
||||||
let _hostname = self.hostname.clone();
|
|
||||||
|
|
||||||
// Create the collection task (we'll handle this differently since we can't clone collectors)
|
|
||||||
// For now, let's create a simpler approach
|
|
||||||
}
|
|
||||||
|
|
||||||
// For simplicity, let's run a main loop instead of separate tasks
|
|
||||||
let mut collection_interval = interval(Duration::from_millis(5000));
|
|
||||||
|
|
||||||
loop {
|
|
||||||
collection_interval.tick().await;
|
|
||||||
|
|
||||||
// Collect from all collectors
|
|
||||||
let mut outputs = Vec::new();
|
|
||||||
for collector in &self.collectors {
|
|
||||||
match collector.collect().await {
|
|
||||||
Ok(output) => {
|
|
||||||
// Send via ZMQ
|
|
||||||
if let Err(e) = self.send_metrics(&output.agent_type, &output.data).await {
|
|
||||||
error!("Failed to send metrics for {}: {}", collector.name(), e);
|
|
||||||
}
|
|
||||||
outputs.push(output);
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
error!("Collection failed for {}: {}", collector.name(), e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process status changes after collection loop to avoid borrowing conflicts
|
|
||||||
for output in outputs {
|
|
||||||
self.check_status_changes(&output).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn send_metrics(&self, agent_type: &AgentType, data: &serde_json::Value) -> anyhow::Result<()> {
|
|
||||||
let message = serde_json::json!({
|
|
||||||
"hostname": self.hostname,
|
|
||||||
"agent_type": agent_type,
|
|
||||||
"timestamp": Utc::now().timestamp() as u64,
|
|
||||||
"metrics": data
|
|
||||||
});
|
|
||||||
|
|
||||||
let serialized = serde_json::to_string(&message)?;
|
|
||||||
self.zmq_socket.send(&serialized, 0)?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn check_status_changes(&mut self, output: &crate::collectors::CollectorOutput) {
|
|
||||||
// Generic status change detection for all agents
|
|
||||||
self.scan_for_status_changes(&output.data, &format!("{:?}", output.agent_type)).await;
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn scan_for_status_changes(&mut self, data: &serde_json::Value, agent_name: &str) {
|
|
||||||
// Recursively scan JSON for any field ending in "_status"
|
|
||||||
self.scan_object_for_status(data, agent_name, "").await;
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn scan_object_for_status(&mut self, value: &serde_json::Value, agent_name: &str, path: &str) {
|
|
||||||
match value {
|
|
||||||
serde_json::Value::Object(obj) => {
|
|
||||||
for (key, val) in obj {
|
|
||||||
let current_path = if path.is_empty() { key.clone() } else { format!("{}.{}", path, key) };
|
|
||||||
|
|
||||||
if key.ends_with("_status") && val.is_string() {
|
|
||||||
// Found a status field - check for changes
|
|
||||||
if let Some(status) = val.as_str() {
|
|
||||||
let component = agent_name.to_lowercase();
|
|
||||||
let metric = key.trim_end_matches("_status");
|
|
||||||
let description = format!("Agent: {}, Component: {}, Source: {}", agent_name, component, current_path);
|
|
||||||
|
|
||||||
if let Some(change) = self.notification_manager.update_status_with_details(&component, metric, status, Some(description)) {
|
|
||||||
info!("Status change: {} {} -> {}", current_path, change.old_status, change.new_status);
|
|
||||||
self.notification_manager.send_notification(change).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Recursively scan nested objects
|
|
||||||
self.scan_object_for_status(val, agent_name, ¤t_path).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
serde_json::Value::Array(arr) => {
|
|
||||||
// Scan array elements for individual item status tracking
|
|
||||||
for (index, item) in arr.iter().enumerate() {
|
|
||||||
let item_path = format!("{}[{}]", path, index);
|
|
||||||
self.scan_object_for_status(item, agent_name, &item_path).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
422
agent/src/status/mod.rs
Normal file
422
agent/src/status/mod.rs
Normal file
@@ -0,0 +1,422 @@
|
|||||||
|
use cm_dashboard_shared::{Status, Metric};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::time::Instant;
|
||||||
|
use tracing::{debug, info, error};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use chrono::Utc;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct HostStatusConfig {
|
||||||
|
pub enabled: bool,
|
||||||
|
pub aggregation_method: String, // "worst_case"
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for HostStatusConfig {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
enabled: true,
|
||||||
|
aggregation_method: "worst_case".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct StatusChangeSummary {
|
||||||
|
pub service_name: String,
|
||||||
|
pub initial_status: Status,
|
||||||
|
pub final_status: Status,
|
||||||
|
pub change_count: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct AggregatedStatusChanges {
|
||||||
|
pub start_time: Instant,
|
||||||
|
pub end_time: Instant,
|
||||||
|
pub service_summaries: Vec<StatusChangeSummary>,
|
||||||
|
pub host_status_initial: Status,
|
||||||
|
pub host_status_final: Status,
|
||||||
|
pub requires_notification: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct HostStatusManager {
|
||||||
|
service_statuses: HashMap<String, Status>,
|
||||||
|
current_host_status: Status,
|
||||||
|
previous_host_status: Status,
|
||||||
|
last_status_change: Option<Instant>,
|
||||||
|
config: HostStatusConfig,
|
||||||
|
// Notification batching
|
||||||
|
pending_changes: HashMap<String, (Status, Status, usize)>, // service -> (initial_status, current_status, change_count)
|
||||||
|
batch_start_time: Option<Instant>,
|
||||||
|
batch_start_host_status: Status,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HostStatusManager {
|
||||||
|
pub fn new(config: HostStatusConfig) -> Self {
|
||||||
|
info!("Initializing HostStatusManager with config: {:?}", config);
|
||||||
|
Self {
|
||||||
|
service_statuses: HashMap::new(),
|
||||||
|
current_host_status: Status::Unknown,
|
||||||
|
previous_host_status: Status::Unknown,
|
||||||
|
last_status_change: None,
|
||||||
|
config,
|
||||||
|
pending_changes: HashMap::new(),
|
||||||
|
batch_start_time: None,
|
||||||
|
batch_start_host_status: Status::Unknown,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Update the status of a specific service and recalculate host status
|
||||||
|
/// Updates real-time status and buffers changes for email notifications
|
||||||
|
pub fn update_service_status(&mut self, service: String, status: Status) {
|
||||||
|
if !self.config.enabled {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let old_service_status = self.service_statuses.get(&service).copied().unwrap_or(Status::Unknown);
|
||||||
|
|
||||||
|
// Only proceed if status actually changed
|
||||||
|
if old_service_status == status {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize batch if this is the first change
|
||||||
|
if self.batch_start_time.is_none() {
|
||||||
|
self.batch_start_time = Some(Instant::now());
|
||||||
|
self.batch_start_host_status = self.current_host_status;
|
||||||
|
debug!("Starting notification batch");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update real-time service status (for dashboard)
|
||||||
|
self.service_statuses.insert(service.clone(), status);
|
||||||
|
|
||||||
|
// Buffer change for email notifications
|
||||||
|
match self.pending_changes.entry(service.clone()) {
|
||||||
|
std::collections::hash_map::Entry::Occupied(mut entry) => {
|
||||||
|
// Service already has changes in this batch - update final status and increment count
|
||||||
|
let (initial_status, _current_status, change_count) = entry.get();
|
||||||
|
entry.insert((*initial_status, status, change_count + 1));
|
||||||
|
}
|
||||||
|
std::collections::hash_map::Entry::Vacant(entry) => {
|
||||||
|
// First change for this service in this batch
|
||||||
|
entry.insert((old_service_status, status, 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Recalculate host status
|
||||||
|
let old_host_status = self.current_host_status;
|
||||||
|
self.previous_host_status = old_host_status;
|
||||||
|
self.current_host_status = self.calculate_host_status();
|
||||||
|
|
||||||
|
if old_host_status != self.current_host_status {
|
||||||
|
self.last_status_change = Some(Instant::now());
|
||||||
|
info!(
|
||||||
|
"Host status changed: {:?} -> {:?} (triggered by service '{}': {:?} -> {:?})",
|
||||||
|
old_host_status, self.current_host_status, service, old_service_status, status
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
"Service status updated: {} {:?} -> {:?}, host status: {:?}, pending notifications: {}",
|
||||||
|
service, old_service_status, status, self.current_host_status, self.pending_changes.len()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the current host status as a metric for broadcasting to dashboard
|
||||||
|
pub fn get_host_status_metric(&self) -> Metric {
|
||||||
|
Metric {
|
||||||
|
name: "host_status_summary".to_string(),
|
||||||
|
value: cm_dashboard_shared::MetricValue::String(format!(
|
||||||
|
"Host aggregated from {} services",
|
||||||
|
self.service_statuses.len()
|
||||||
|
)),
|
||||||
|
status: self.current_host_status,
|
||||||
|
timestamp: Utc::now().timestamp() as u64,
|
||||||
|
description: Some("Aggregated host status from all services".to_string()),
|
||||||
|
unit: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate the overall host status based on all service statuses
|
||||||
|
fn calculate_host_status(&self) -> Status {
|
||||||
|
if self.service_statuses.is_empty() {
|
||||||
|
return Status::Unknown;
|
||||||
|
}
|
||||||
|
|
||||||
|
match self.config.aggregation_method.as_str() {
|
||||||
|
"worst_case" => {
|
||||||
|
let statuses: Vec<Status> = self.service_statuses.values().copied().collect();
|
||||||
|
Status::aggregate(&statuses)
|
||||||
|
},
|
||||||
|
_ => {
|
||||||
|
debug!("Unknown aggregation method: {}, falling back to worst_case", self.config.aggregation_method);
|
||||||
|
let statuses: Vec<Status> = self.service_statuses.values().copied().collect();
|
||||||
|
Status::aggregate(&statuses)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Process a metric - updates status and queues for aggregated notifications if status changed
|
||||||
|
pub async fn process_metric(&mut self, metric: &Metric, _notification_manager: &mut crate::notifications::NotificationManager) -> bool {
|
||||||
|
let old_service_status = self.service_statuses.get(&metric.name).copied();
|
||||||
|
let old_host_status = self.current_host_status;
|
||||||
|
let new_service_status = metric.status;
|
||||||
|
|
||||||
|
// Update status (this recalculates host status internally)
|
||||||
|
self.update_service_status(metric.name.clone(), new_service_status);
|
||||||
|
|
||||||
|
let new_host_status = self.current_host_status;
|
||||||
|
let mut status_changed = false;
|
||||||
|
|
||||||
|
// Check if service status actually changed (ignore first-time status setting)
|
||||||
|
if let Some(old_service_status) = old_service_status {
|
||||||
|
if old_service_status != new_service_status {
|
||||||
|
debug!("Service status change detected for {}: {:?} -> {:?}", metric.name, old_service_status, new_service_status);
|
||||||
|
|
||||||
|
// Queue change for aggregated notification (not immediate)
|
||||||
|
self.queue_status_change(&metric.name, old_service_status, new_service_status);
|
||||||
|
|
||||||
|
status_changed = true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
debug!("Initial status set for {}: {:?}", metric.name, new_service_status);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if host status changed (this should trigger immediate transmission)
|
||||||
|
if old_host_status != new_host_status {
|
||||||
|
debug!("Host status change detected: {:?} -> {:?}", old_host_status, new_host_status);
|
||||||
|
status_changed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
status_changed // Return true if either service or host status changed
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Queue status change for aggregated notification
|
||||||
|
fn queue_status_change(&mut self, metric_name: &str, old_status: Status, new_status: Status) {
|
||||||
|
// Add to pending changes for aggregated notification
|
||||||
|
let entry = self.pending_changes.entry(metric_name.to_string()).or_insert((old_status, old_status, 0));
|
||||||
|
entry.1 = new_status; // Update final status
|
||||||
|
entry.2 += 1; // Increment change count
|
||||||
|
|
||||||
|
// Set batch start time if this is the first change
|
||||||
|
if self.batch_start_time.is_none() {
|
||||||
|
self.batch_start_time = Some(Instant::now());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Process pending notifications - legacy method, now rarely used
|
||||||
|
pub async fn process_pending_notifications(&mut self, notification_manager: &mut crate::notifications::NotificationManager) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||||
|
if !self.config.enabled || self.pending_changes.is_empty() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process notifications immediately without interval batching
|
||||||
|
|
||||||
|
// Create aggregated status changes
|
||||||
|
let aggregated = self.create_aggregated_changes();
|
||||||
|
|
||||||
|
if aggregated.requires_notification {
|
||||||
|
info!("Sending aggregated notification for {} service changes", aggregated.service_summaries.len());
|
||||||
|
|
||||||
|
// Send aggregated notification
|
||||||
|
if let Err(e) = self.send_aggregated_email(&aggregated, notification_manager).await {
|
||||||
|
error!("Failed to send aggregated notification: {}", e);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
debug!("No significant changes requiring notification in batch of {} changes", self.pending_changes.len());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clear the batch
|
||||||
|
self.clear_notification_batch();
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create aggregated status changes from pending buffer
|
||||||
|
fn create_aggregated_changes(&self) -> AggregatedStatusChanges {
|
||||||
|
let mut service_summaries = Vec::new();
|
||||||
|
let mut requires_notification = false;
|
||||||
|
|
||||||
|
for (service_name, (initial_status, final_status, change_count)) in &self.pending_changes {
|
||||||
|
let significant_change = self.is_significant_change(*initial_status, *final_status);
|
||||||
|
if significant_change {
|
||||||
|
requires_notification = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
service_summaries.push(StatusChangeSummary {
|
||||||
|
service_name: service_name.clone(),
|
||||||
|
initial_status: *initial_status,
|
||||||
|
final_status: *final_status,
|
||||||
|
change_count: *change_count,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Also check if host status change is significant
|
||||||
|
if self.is_significant_change(self.batch_start_host_status, self.current_host_status) {
|
||||||
|
requires_notification = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
AggregatedStatusChanges {
|
||||||
|
start_time: self.batch_start_time.unwrap_or_else(Instant::now),
|
||||||
|
end_time: Instant::now(),
|
||||||
|
service_summaries,
|
||||||
|
host_status_initial: self.batch_start_host_status,
|
||||||
|
host_status_final: self.current_host_status,
|
||||||
|
requires_notification,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if a status change is significant enough for notification
|
||||||
|
fn is_significant_change(&self, old_status: Status, new_status: Status) -> bool {
|
||||||
|
match (old_status, new_status) {
|
||||||
|
// Don't notify on transitions from Unknown (startup/restart scenario)
|
||||||
|
(Status::Unknown, _) => false,
|
||||||
|
// Always notify on problems (but not from Unknown)
|
||||||
|
(_, Status::Warning) | (_, Status::Critical) => true,
|
||||||
|
// Only notify on recovery if it's from a problem state to OK and all services are OK
|
||||||
|
(Status::Warning | Status::Critical, Status::Ok) => self.current_host_status == Status::Ok,
|
||||||
|
// Don't notify on other transitions
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn send_aggregated_email(
|
||||||
|
&self,
|
||||||
|
aggregated: &AggregatedStatusChanges,
|
||||||
|
notification_manager: &mut crate::notifications::NotificationManager,
|
||||||
|
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||||
|
let mut summary_parts = Vec::new();
|
||||||
|
let critical_count = aggregated.service_summaries.iter().filter(|s| s.final_status == Status::Critical).count();
|
||||||
|
let warning_count = aggregated.service_summaries.iter().filter(|s| s.final_status == Status::Warning).count();
|
||||||
|
let recovery_count = aggregated.service_summaries.iter().filter(|s|
|
||||||
|
matches!((s.initial_status, s.final_status), (Status::Warning | Status::Critical, Status::Ok))
|
||||||
|
).count();
|
||||||
|
let startup_count = aggregated.service_summaries.iter().filter(|s|
|
||||||
|
matches!((s.initial_status, s.final_status), (Status::Unknown, Status::Ok | Status::Pending))
|
||||||
|
).count();
|
||||||
|
|
||||||
|
if critical_count > 0 { summary_parts.push(format!("{} critical", critical_count)); }
|
||||||
|
if warning_count > 0 { summary_parts.push(format!("{} warning", warning_count)); }
|
||||||
|
if recovery_count > 0 { summary_parts.push(format!("{} recovered", recovery_count)); }
|
||||||
|
if startup_count > 0 { summary_parts.push(format!("{} started", startup_count)); }
|
||||||
|
|
||||||
|
let summary_text = if summary_parts.is_empty() {
|
||||||
|
format!("{} service changes", aggregated.service_summaries.len())
|
||||||
|
} else {
|
||||||
|
summary_parts.join(", ")
|
||||||
|
};
|
||||||
|
|
||||||
|
let subject = format!("Status Alert: {}", summary_text);
|
||||||
|
let body = self.format_aggregated_details(aggregated);
|
||||||
|
|
||||||
|
notification_manager.send_direct_email(&subject, &body).await.map_err(|e| e.into())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Format details for aggregated notification
|
||||||
|
fn format_aggregated_details(&self, aggregated: &AggregatedStatusChanges) -> String {
|
||||||
|
let mut details = String::new();
|
||||||
|
|
||||||
|
let duration = aggregated.end_time.duration_since(aggregated.start_time).as_secs();
|
||||||
|
details.push_str(&format!(
|
||||||
|
"Status Summary ({}s duration)\n",
|
||||||
|
duration
|
||||||
|
));
|
||||||
|
|
||||||
|
if aggregated.host_status_initial != aggregated.host_status_final {
|
||||||
|
details.push_str(&format!(
|
||||||
|
"Host Status: {:?} → {:?}\n\n",
|
||||||
|
aggregated.host_status_initial,
|
||||||
|
aggregated.host_status_final
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Group services by change type
|
||||||
|
let mut critical_changes = Vec::new();
|
||||||
|
let mut warning_changes = Vec::new();
|
||||||
|
let mut recovery_changes = Vec::new();
|
||||||
|
let mut startup_changes = Vec::new();
|
||||||
|
let mut other_changes = Vec::new();
|
||||||
|
|
||||||
|
for summary in &aggregated.service_summaries {
|
||||||
|
let change_info = format!(
|
||||||
|
"{}: {:?} → {:?}{}",
|
||||||
|
summary.service_name,
|
||||||
|
summary.initial_status,
|
||||||
|
summary.final_status,
|
||||||
|
if summary.change_count > 1 { format!(" ({} changes)", summary.change_count) } else { String::new() }
|
||||||
|
);
|
||||||
|
|
||||||
|
match (summary.initial_status, summary.final_status) {
|
||||||
|
(_, Status::Critical) => critical_changes.push(change_info),
|
||||||
|
(_, Status::Warning) => warning_changes.push(change_info),
|
||||||
|
(Status::Warning | Status::Critical, Status::Ok) => recovery_changes.push(change_info),
|
||||||
|
(Status::Unknown, Status::Ok | Status::Pending) => startup_changes.push(change_info),
|
||||||
|
_ => other_changes.push(change_info),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Show critical problems first
|
||||||
|
if !critical_changes.is_empty() {
|
||||||
|
details.push_str(&format!("🔴 CRITICAL ISSUES ({}):\n", critical_changes.len()));
|
||||||
|
for change in critical_changes {
|
||||||
|
details.push_str(&format!(" {}\n", change));
|
||||||
|
}
|
||||||
|
details.push('\n');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Show warnings
|
||||||
|
if !warning_changes.is_empty() {
|
||||||
|
details.push_str(&format!("🟡 WARNINGS ({}):\n", warning_changes.len()));
|
||||||
|
for change in warning_changes {
|
||||||
|
details.push_str(&format!(" {}\n", change));
|
||||||
|
}
|
||||||
|
details.push('\n');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Show recoveries only if host status is now OK (all services recovered)
|
||||||
|
if !recovery_changes.is_empty() && aggregated.host_status_final == Status::Ok {
|
||||||
|
details.push_str(&format!("✅ RECOVERIES ({}):\n", recovery_changes.len()));
|
||||||
|
for change in recovery_changes {
|
||||||
|
details.push_str(&format!(" {}\n", change));
|
||||||
|
}
|
||||||
|
details.push('\n');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Show startups (usually not important but good to know)
|
||||||
|
if !startup_changes.is_empty() {
|
||||||
|
details.push_str(&format!("🟢 SERVICE STARTUPS ({}):\n", startup_changes.len()));
|
||||||
|
for change in startup_changes {
|
||||||
|
details.push_str(&format!(" {}\n", change));
|
||||||
|
}
|
||||||
|
details.push('\n');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Show other changes
|
||||||
|
if !other_changes.is_empty() {
|
||||||
|
details.push_str(&format!("ℹ️ OTHER CHANGES ({}):\n", other_changes.len()));
|
||||||
|
for change in other_changes {
|
||||||
|
details.push_str(&format!(" {}\n", change));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
details
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clear the notification batch
|
||||||
|
fn clear_notification_batch(&mut self) {
|
||||||
|
self.pending_changes.clear();
|
||||||
|
self.batch_start_time = None;
|
||||||
|
self.batch_start_host_status = self.current_host_status;
|
||||||
|
debug!("Cleared notification batch");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tests temporarily disabled due to API changes
|
||||||
|
// The functionality works as tested manually
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
// Tests will be updated to match the new notification batching API
|
||||||
|
}
|
||||||
@@ -1,73 +0,0 @@
|
|||||||
# CM Dashboard Agent Configuration
|
|
||||||
# Example configuration file for the ZMQ metrics agent
|
|
||||||
|
|
||||||
[agent]
|
|
||||||
# Hostname to advertise in metrics (auto-detected if not specified)
|
|
||||||
hostname = "srv01"
|
|
||||||
|
|
||||||
# Log level: trace, debug, info, warn, error
|
|
||||||
log_level = "info"
|
|
||||||
|
|
||||||
# Maximum number of metrics to buffer before dropping
|
|
||||||
metrics_buffer_size = 1000
|
|
||||||
|
|
||||||
[zmq]
|
|
||||||
# ZMQ publisher port
|
|
||||||
port = 6130
|
|
||||||
|
|
||||||
# Bind address (0.0.0.0 for all interfaces, 127.0.0.1 for localhost only)
|
|
||||||
bind_address = "0.0.0.0"
|
|
||||||
|
|
||||||
# ZMQ socket timeouts in milliseconds
|
|
||||||
send_timeout_ms = 5000
|
|
||||||
receive_timeout_ms = 5000
|
|
||||||
|
|
||||||
[collectors.smart]
|
|
||||||
# Enable SMART metrics collection (disk health, temperature, wear)
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
# Collection interval in milliseconds (minimum 1000ms)
|
|
||||||
interval_ms = 5000
|
|
||||||
|
|
||||||
# List of storage devices to monitor (without /dev/ prefix)
|
|
||||||
devices = ["nvme0n1", "sda", "sdb"]
|
|
||||||
|
|
||||||
# Timeout for smartctl commands in milliseconds
|
|
||||||
timeout_ms = 30000
|
|
||||||
|
|
||||||
[collectors.service]
|
|
||||||
# Enable service metrics collection (systemd services)
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
# Collection interval in milliseconds (minimum 500ms)
|
|
||||||
interval_ms = 5000
|
|
||||||
|
|
||||||
# List of systemd services to monitor
|
|
||||||
services = [
|
|
||||||
"gitea",
|
|
||||||
"immich",
|
|
||||||
"vaultwarden",
|
|
||||||
"unifi",
|
|
||||||
"smart-metrics-api",
|
|
||||||
"service-metrics-api",
|
|
||||||
"backup-metrics-api"
|
|
||||||
]
|
|
||||||
|
|
||||||
# Timeout for systemctl commands in milliseconds
|
|
||||||
timeout_ms = 10000
|
|
||||||
|
|
||||||
[collectors.backup]
|
|
||||||
# Enable backup metrics collection (restic integration)
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
# Collection interval in milliseconds (minimum 5000ms)
|
|
||||||
interval_ms = 30000
|
|
||||||
|
|
||||||
# Restic repository path (leave empty to disable restic integration)
|
|
||||||
restic_repo = "/srv/backups/restic"
|
|
||||||
|
|
||||||
# Systemd service name for backup monitoring
|
|
||||||
backup_service = "restic-backup"
|
|
||||||
|
|
||||||
# Timeout for restic and backup commands in milliseconds
|
|
||||||
timeout_ms = 30000
|
|
||||||
@@ -1,44 +0,0 @@
|
|||||||
# CM Dashboard configuration template
|
|
||||||
|
|
||||||
[hosts]
|
|
||||||
# default_host = "srv01"
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "srv01"
|
|
||||||
enabled = true
|
|
||||||
# metadata = { rack = "R1" }
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "labbox"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[dashboard]
|
|
||||||
tick_rate_ms = 250
|
|
||||||
history_duration_minutes = 60
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "nvme"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "services"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "backup"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "alerts"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[data_source]
|
|
||||||
kind = "zmq"
|
|
||||||
|
|
||||||
[data_source.zmq]
|
|
||||||
endpoints = ["tcp://127.0.0.1:6130"]
|
|
||||||
# subscribe = ""
|
|
||||||
|
|
||||||
[filesystem]
|
|
||||||
# cache_dir = "/var/lib/cm-dashboard/cache"
|
|
||||||
# history_dir = "/var/lib/cm-dashboard/history"
|
|
||||||
@@ -1,39 +0,0 @@
|
|||||||
# CM Dashboard configuration
|
|
||||||
|
|
||||||
[hosts]
|
|
||||||
# default_host = "srv01"
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "srv01"
|
|
||||||
base_url = "http://srv01.local"
|
|
||||||
enabled = true
|
|
||||||
# metadata = { rack = "R1" }
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "labbox"
|
|
||||||
base_url = "http://labbox.local"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[dashboard]
|
|
||||||
tick_rate_ms = 250
|
|
||||||
history_duration_minutes = 60
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "nvme"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "services"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "backup"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "alerts"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[filesystem]
|
|
||||||
# cache_dir = "/var/lib/cm-dashboard/cache"
|
|
||||||
# history_dir = "/var/lib/cm-dashboard/history"
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# Hosts configuration template (optional if you want a separate hosts file)
|
|
||||||
|
|
||||||
[hosts]
|
|
||||||
# default_host = "srv01"
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "srv01"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "labbox"
|
|
||||||
enabled = true
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
# Optional separate hosts configuration
|
|
||||||
|
|
||||||
[hosts]
|
|
||||||
# default_host = "srv01"
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "srv01"
|
|
||||||
base_url = "http://srv01.local"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "labbox"
|
|
||||||
base_url = "http://labbox.local"
|
|
||||||
enabled = true
|
|
||||||
@@ -1,21 +1,21 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "cm-dashboard"
|
name = "cm-dashboard"
|
||||||
version = "0.1.0"
|
version = "0.1.50"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
cm-dashboard-shared = { path = "../shared" }
|
cm-dashboard-shared = { workspace = true }
|
||||||
ratatui = "0.24"
|
tokio = { workspace = true }
|
||||||
crossterm = "0.27"
|
serde = { workspace = true }
|
||||||
tokio = { version = "1.0", features = ["full"] }
|
serde_json = { workspace = true }
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
thiserror = { workspace = true }
|
||||||
serde_json = "1.0"
|
anyhow = { workspace = true }
|
||||||
clap = { version = "4.0", features = ["derive"] }
|
chrono = { workspace = true }
|
||||||
anyhow = "1.0"
|
clap = { workspace = true }
|
||||||
chrono = { version = "0.4", features = ["serde"] }
|
zmq = { workspace = true }
|
||||||
toml = "0.8"
|
tracing = { workspace = true }
|
||||||
tracing = "0.1"
|
tracing-subscriber = { workspace = true }
|
||||||
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
|
ratatui = { workspace = true }
|
||||||
tracing-appender = "0.2"
|
crossterm = { workspace = true }
|
||||||
zmq = "0.10"
|
toml = { workspace = true }
|
||||||
gethostname = "0.4"
|
gethostname = { workspace = true }
|
||||||
@@ -1,49 +0,0 @@
|
|||||||
# CM Dashboard configuration
|
|
||||||
|
|
||||||
[hosts]
|
|
||||||
# default_host = "srv01"
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "srv01"
|
|
||||||
enabled = true
|
|
||||||
# metadata = { rack = "R1" }
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "labbox"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[dashboard]
|
|
||||||
tick_rate_ms = 250
|
|
||||||
history_duration_minutes = 60
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "nvme"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "services"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "backup"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "alerts"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[data_source]
|
|
||||||
kind = "zmq"
|
|
||||||
|
|
||||||
[data_source.zmq]
|
|
||||||
endpoints = [
|
|
||||||
"tcp://srv01:6130", # srv01
|
|
||||||
"tcp://cmbox:6130", # cmbox
|
|
||||||
"tcp://simonbox:6130", # simonbox
|
|
||||||
"tcp://steambox:6130", # steambox
|
|
||||||
"tcp://labbox:6130", # labbox
|
|
||||||
]
|
|
||||||
|
|
||||||
[filesystem]
|
|
||||||
# cache_dir = "/var/lib/cm-dashboard/cache"
|
|
||||||
# history_dir = "/var/lib/cm-dashboard/history"
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
# Optional separate hosts configuration
|
|
||||||
|
|
||||||
[hosts]
|
|
||||||
# default_host = "srv01"
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "srv01"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "labbox"
|
|
||||||
enabled = true
|
|
||||||
@@ -1,645 +1,335 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
use std::path::PathBuf;
|
|
||||||
use std::time::{Duration, Instant};
|
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use chrono::{DateTime, Utc};
|
use crossterm::{
|
||||||
use crossterm::event::{KeyCode, KeyEvent, KeyEventKind};
|
event::{self},
|
||||||
use gethostname::gethostname;
|
execute,
|
||||||
|
terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen},
|
||||||
|
};
|
||||||
|
use ratatui::{backend::CrosstermBackend, Terminal};
|
||||||
|
use std::io;
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
use tracing::{debug, error, info, warn};
|
||||||
|
|
||||||
use crate::config;
|
use crate::communication::{AgentCommand, ServiceAction, ZmqCommandSender, ZmqConsumer};
|
||||||
use crate::data::config::{AppConfig, DataSourceKind, HostTarget, ZmqConfig, DEFAULT_HOSTS};
|
use crate::config::DashboardConfig;
|
||||||
use crate::data::history::MetricsHistory;
|
use crate::metrics::MetricStore;
|
||||||
use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics, SystemMetrics};
|
use crate::ui::{TuiApp, UiCommand};
|
||||||
|
|
||||||
// Host connection timeout - if no data received for this duration, mark as timeout
|
pub struct Dashboard {
|
||||||
// Keep-alive mechanism: agents send data every 5 seconds, timeout after 15 seconds
|
zmq_consumer: ZmqConsumer,
|
||||||
const HOST_CONNECTION_TIMEOUT: Duration = Duration::from_secs(15);
|
zmq_command_sender: ZmqCommandSender,
|
||||||
|
metric_store: MetricStore,
|
||||||
/// Shared application settings derived from the CLI arguments.
|
tui_app: Option<TuiApp>,
|
||||||
#[derive(Debug, Clone)]
|
terminal: Option<Terminal<CrosstermBackend<io::Stdout>>>,
|
||||||
pub struct AppOptions {
|
headless: bool,
|
||||||
pub config: Option<PathBuf>,
|
initial_commands_sent: std::collections::HashSet<String>,
|
||||||
pub host: Option<String>,
|
_config: DashboardConfig,
|
||||||
pub tick_rate: Duration,
|
|
||||||
pub verbosity: u8,
|
|
||||||
pub zmq_endpoints_override: Vec<String>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AppOptions {
|
impl Dashboard {
|
||||||
pub fn tick_rate(&self) -> Duration {
|
pub async fn new(config_path: Option<String>, headless: bool) -> Result<Self> {
|
||||||
self.tick_rate
|
info!("Initializing dashboard");
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Default)]
|
// Load configuration - try default path if not specified
|
||||||
struct HostRuntimeState {
|
let config = match config_path {
|
||||||
last_success: Option<DateTime<Utc>>,
|
Some(path) => DashboardConfig::load_from_file(&path)?,
|
||||||
last_error: Option<String>,
|
None => {
|
||||||
connection_status: ConnectionStatus,
|
// Try default NixOS config path
|
||||||
smart: Option<SmartMetrics>,
|
let default_path = "/etc/cm-dashboard/dashboard.toml";
|
||||||
services: Option<ServiceMetrics>,
|
match DashboardConfig::load_from_file(default_path) {
|
||||||
system: Option<SystemMetrics>,
|
Ok(config) => {
|
||||||
backup: Option<BackupMetrics>,
|
info!("Using default config file: {}", default_path);
|
||||||
}
|
config
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("Configuration file is required. Use --config to specify path or ensure {} exists.", default_path);
|
||||||
|
error!("Failed to load default config: {}", e);
|
||||||
|
return Err(anyhow::anyhow!("Missing required configuration file"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Default)]
|
// Initialize ZMQ consumer
|
||||||
pub enum ConnectionStatus {
|
let mut zmq_consumer = match ZmqConsumer::new(&config.zmq).await {
|
||||||
#[default]
|
Ok(consumer) => consumer,
|
||||||
Unknown,
|
Err(e) => {
|
||||||
Connected,
|
error!("Failed to initialize ZMQ consumer: {}", e);
|
||||||
Timeout,
|
return Err(e);
|
||||||
Error,
|
}
|
||||||
}
|
};
|
||||||
|
|
||||||
/// Top-level application state container.
|
// Initialize ZMQ command sender
|
||||||
#[derive(Debug)]
|
let zmq_command_sender = match ZmqCommandSender::new(&config.zmq) {
|
||||||
pub struct App {
|
Ok(sender) => sender,
|
||||||
options: AppOptions,
|
Err(e) => {
|
||||||
#[allow(dead_code)]
|
error!("Failed to initialize ZMQ command sender: {}", e);
|
||||||
config: Option<AppConfig>,
|
return Err(e);
|
||||||
#[allow(dead_code)]
|
}
|
||||||
active_config_path: Option<PathBuf>,
|
};
|
||||||
hosts: Vec<HostTarget>,
|
|
||||||
history: MetricsHistory,
|
|
||||||
host_states: HashMap<String, HostRuntimeState>,
|
|
||||||
zmq_endpoints: Vec<String>,
|
|
||||||
zmq_subscription: Option<String>,
|
|
||||||
zmq_connected: bool,
|
|
||||||
active_host_index: usize,
|
|
||||||
show_help: bool,
|
|
||||||
should_quit: bool,
|
|
||||||
last_tick: Instant,
|
|
||||||
tick_count: u64,
|
|
||||||
status: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl App {
|
// Connect to predefined hosts from configuration
|
||||||
pub fn new(options: AppOptions) -> Result<Self> {
|
let hosts = config.hosts.predefined_hosts.clone();
|
||||||
let (config, active_config_path) = Self::load_configuration(options.config.as_ref())?;
|
|
||||||
|
|
||||||
let hosts = Self::select_hosts(options.host.as_ref(), config.as_ref());
|
// Try to connect to hosts but don't fail if none are available
|
||||||
let history_capacity = Self::history_capacity_hint(config.as_ref());
|
match zmq_consumer.connect_to_predefined_hosts(&hosts).await {
|
||||||
let history = MetricsHistory::with_capacity(history_capacity);
|
Ok(_) => info!("Successfully connected to ZMQ hosts"),
|
||||||
let host_states = hosts
|
Err(e) => {
|
||||||
.iter()
|
warn!(
|
||||||
.map(|host| (host.name.clone(), HostRuntimeState::default()))
|
"Failed to connect to hosts (this is normal if no agents are running): {}",
|
||||||
.collect::<HashMap<_, _>>();
|
e
|
||||||
|
);
|
||||||
let (mut zmq_endpoints, zmq_subscription) = Self::resolve_zmq_config(config.as_ref());
|
info!("Dashboard will start anyway and connect when agents become available");
|
||||||
if !options.zmq_endpoints_override.is_empty() {
|
}
|
||||||
zmq_endpoints = options.zmq_endpoints_override.clone();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let status = Self::build_initial_status(options.host.as_ref(), active_config_path.as_ref());
|
// Initialize metric store
|
||||||
|
let metric_store = MetricStore::new(10000, 24); // 10k metrics, 24h retention
|
||||||
|
|
||||||
|
// Initialize TUI components only if not headless
|
||||||
|
let (tui_app, terminal) = if headless {
|
||||||
|
info!("Running in headless mode (no TUI)");
|
||||||
|
(None, None)
|
||||||
|
} else {
|
||||||
|
// Initialize TUI app
|
||||||
|
let tui_app = TuiApp::new(config.clone());
|
||||||
|
|
||||||
|
// Setup terminal
|
||||||
|
if let Err(e) = enable_raw_mode() {
|
||||||
|
error!("Failed to enable raw mode: {}", e);
|
||||||
|
error!(
|
||||||
|
"This usually means the dashboard is being run without a proper terminal (TTY)"
|
||||||
|
);
|
||||||
|
error!("Try running with --headless flag or in a proper terminal");
|
||||||
|
return Err(e.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut stdout = io::stdout();
|
||||||
|
if let Err(e) = execute!(stdout, EnterAlternateScreen) {
|
||||||
|
error!("Failed to enter alternate screen: {}", e);
|
||||||
|
let _ = disable_raw_mode();
|
||||||
|
return Err(e.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
let backend = CrosstermBackend::new(stdout);
|
||||||
|
let terminal = match Terminal::new(backend) {
|
||||||
|
Ok(term) => term,
|
||||||
|
Err(e) => {
|
||||||
|
error!("Failed to create terminal: {}", e);
|
||||||
|
let _ = disable_raw_mode();
|
||||||
|
return Err(e.into());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
(Some(tui_app), Some(terminal))
|
||||||
|
};
|
||||||
|
|
||||||
|
info!("Dashboard initialization complete");
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
options,
|
zmq_consumer,
|
||||||
config,
|
zmq_command_sender,
|
||||||
active_config_path,
|
metric_store,
|
||||||
hosts,
|
tui_app,
|
||||||
history,
|
terminal,
|
||||||
host_states,
|
headless,
|
||||||
zmq_endpoints,
|
initial_commands_sent: std::collections::HashSet::new(),
|
||||||
zmq_subscription,
|
_config: config,
|
||||||
zmq_connected: false,
|
|
||||||
active_host_index: 0,
|
|
||||||
show_help: false,
|
|
||||||
should_quit: false,
|
|
||||||
last_tick: Instant::now(),
|
|
||||||
tick_count: 0,
|
|
||||||
status,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn on_tick(&mut self) {
|
/// Send a command to a specific agent
|
||||||
self.tick_count = self.tick_count.saturating_add(1);
|
pub async fn send_command(&mut self, hostname: &str, command: AgentCommand) -> Result<()> {
|
||||||
self.last_tick = Instant::now();
|
self.zmq_command_sender
|
||||||
|
.send_command(hostname, command)
|
||||||
// Check for host connection timeouts
|
.await
|
||||||
self.check_host_timeouts();
|
|
||||||
|
|
||||||
let host_count = self.hosts.len();
|
|
||||||
let retention = self.history.retention();
|
|
||||||
self.status = format!(
|
|
||||||
"Monitoring • hosts: {} • refresh: {:?} • retention: {:?}",
|
|
||||||
host_count, self.options.tick_rate, retention
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn handle_key_event(&mut self, key: KeyEvent) {
|
pub async fn run(&mut self) -> Result<()> {
|
||||||
if key.kind != KeyEventKind::Press {
|
info!("Starting dashboard main loop");
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
match key.code {
|
let mut last_metrics_check = Instant::now();
|
||||||
KeyCode::Char('q') | KeyCode::Char('Q') | KeyCode::Esc => {
|
let metrics_check_interval = Duration::from_millis(100); // Check for metrics every 100ms
|
||||||
self.should_quit = true;
|
|
||||||
self.status = "Exiting…".to_string();
|
|
||||||
}
|
|
||||||
KeyCode::Char('r') | KeyCode::Char('R') => {
|
|
||||||
self.status = "Manual refresh requested".to_string();
|
|
||||||
}
|
|
||||||
KeyCode::Left | KeyCode::Char('h') => {
|
|
||||||
self.select_previous_host();
|
|
||||||
}
|
|
||||||
KeyCode::Right | KeyCode::Char('l') | KeyCode::Tab => {
|
|
||||||
self.select_next_host();
|
|
||||||
}
|
|
||||||
KeyCode::Char('?') => {
|
|
||||||
self.show_help = !self.show_help;
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn should_quit(&self) -> bool {
|
loop {
|
||||||
self.should_quit
|
// Handle terminal events (keyboard input) only if not headless
|
||||||
}
|
if !self.headless {
|
||||||
|
match event::poll(Duration::from_millis(50)) {
|
||||||
#[allow(dead_code)]
|
Ok(true) => {
|
||||||
pub fn status_text(&self) -> &str {
|
match event::read() {
|
||||||
&self.status
|
Ok(event) => {
|
||||||
}
|
if let Some(ref mut tui_app) = self.tui_app {
|
||||||
|
// Handle input and check for commands
|
||||||
#[allow(dead_code)]
|
match tui_app.handle_input(event) {
|
||||||
pub fn zmq_connected(&self) -> bool {
|
Ok(Some(command)) => {
|
||||||
self.zmq_connected
|
// Execute the command
|
||||||
}
|
if let Err(e) = self.execute_ui_command(command).await {
|
||||||
|
error!("Failed to execute UI command: {}", e);
|
||||||
pub fn tick_rate(&self) -> Duration {
|
}
|
||||||
self.options.tick_rate()
|
}
|
||||||
}
|
Ok(None) => {
|
||||||
|
// No command, check if we should quit
|
||||||
#[allow(dead_code)]
|
if tui_app.should_quit() {
|
||||||
pub fn config(&self) -> Option<&AppConfig> {
|
info!("Quit requested, exiting dashboard");
|
||||||
self.config.as_ref()
|
break;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
#[allow(dead_code)]
|
Err(e) => {
|
||||||
pub fn active_config_path(&self) -> Option<&PathBuf> {
|
error!("Error handling input: {}", e);
|
||||||
self.active_config_path.as_ref()
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
#[allow(dead_code)]
|
}
|
||||||
pub fn hosts(&self) -> &[HostTarget] {
|
Err(e) => {
|
||||||
&self.hosts
|
error!("Error reading terminal event: {}", e);
|
||||||
}
|
break;
|
||||||
|
}
|
||||||
pub fn active_host_info(&self) -> Option<(usize, &HostTarget)> {
|
|
||||||
if self.hosts.is_empty() {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
let index = self
|
|
||||||
.active_host_index
|
|
||||||
.min(self.hosts.len().saturating_sub(1));
|
|
||||||
Some((index, &self.hosts[index]))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(dead_code)]
|
|
||||||
pub fn history(&self) -> &MetricsHistory {
|
|
||||||
&self.history
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn host_display_data(&self) -> Vec<HostDisplayData> {
|
|
||||||
self.hosts
|
|
||||||
.iter()
|
|
||||||
.filter_map(|host| {
|
|
||||||
self.host_states
|
|
||||||
.get(&host.name)
|
|
||||||
.and_then(|state| {
|
|
||||||
// Only show hosts that have successfully connected at least once
|
|
||||||
if state.last_success.is_some() {
|
|
||||||
Some(HostDisplayData {
|
|
||||||
name: host.name.clone(),
|
|
||||||
last_success: state.last_success.clone(),
|
|
||||||
last_error: state.last_error.clone(),
|
|
||||||
connection_status: state.connection_status.clone(),
|
|
||||||
smart: state.smart.clone(),
|
|
||||||
services: state.services.clone(),
|
|
||||||
system: state.system.clone(),
|
|
||||||
backup: state.backup.clone(),
|
|
||||||
})
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
}
|
||||||
})
|
|
||||||
})
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn active_host_display(&self) -> Option<HostDisplayData> {
|
|
||||||
self.active_host_info().and_then(|(_, host)| {
|
|
||||||
self.host_states
|
|
||||||
.get(&host.name)
|
|
||||||
.map(|state| HostDisplayData {
|
|
||||||
name: host.name.clone(),
|
|
||||||
last_success: state.last_success.clone(),
|
|
||||||
last_error: state.last_error.clone(),
|
|
||||||
connection_status: state.connection_status.clone(),
|
|
||||||
smart: state.smart.clone(),
|
|
||||||
services: state.services.clone(),
|
|
||||||
system: state.system.clone(),
|
|
||||||
backup: state.backup.clone(),
|
|
||||||
})
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn zmq_context(&self) -> Option<ZmqContext> {
|
|
||||||
if self.zmq_endpoints.is_empty() {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
Some(ZmqContext::new(
|
|
||||||
self.zmq_endpoints.clone(),
|
|
||||||
self.zmq_subscription.clone(),
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn handle_app_event(&mut self, event: AppEvent) {
|
|
||||||
match event {
|
|
||||||
AppEvent::Shutdown => {
|
|
||||||
self.should_quit = true;
|
|
||||||
self.status = "Shutting down…".to_string();
|
|
||||||
}
|
|
||||||
AppEvent::MetricsUpdated {
|
|
||||||
host,
|
|
||||||
smart,
|
|
||||||
services,
|
|
||||||
system,
|
|
||||||
backup,
|
|
||||||
timestamp,
|
|
||||||
} => {
|
|
||||||
self.zmq_connected = true;
|
|
||||||
self.ensure_host_entry(&host);
|
|
||||||
let state = self.host_states.entry(host.clone()).or_default();
|
|
||||||
state.last_success = Some(timestamp);
|
|
||||||
state.last_error = None;
|
|
||||||
state.connection_status = ConnectionStatus::Connected;
|
|
||||||
|
|
||||||
if let Some(mut smart_metrics) = smart {
|
|
||||||
if smart_metrics.timestamp != timestamp {
|
|
||||||
smart_metrics.timestamp = timestamp;
|
|
||||||
}
|
}
|
||||||
let snapshot = smart_metrics.clone();
|
Ok(false) => {} // No events available (timeout)
|
||||||
self.history.record_smart(smart_metrics);
|
Err(e) => {
|
||||||
state.smart = Some(snapshot);
|
error!("Error polling for terminal events: {}", e);
|
||||||
}
|
break;
|
||||||
|
|
||||||
if let Some(mut service_metrics) = services {
|
|
||||||
if service_metrics.timestamp != timestamp {
|
|
||||||
service_metrics.timestamp = timestamp;
|
|
||||||
}
|
}
|
||||||
let snapshot = service_metrics.clone();
|
|
||||||
|
|
||||||
// No more need for dashboard-side description caching since agent handles it
|
|
||||||
|
|
||||||
self.history.record_services(service_metrics);
|
|
||||||
state.services = Some(snapshot);
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(system_metrics) = system {
|
// Check for new metrics
|
||||||
// Convert timestamp format (u64 to DateTime<Utc>)
|
if last_metrics_check.elapsed() >= metrics_check_interval {
|
||||||
let system_snapshot = SystemMetrics {
|
if let Ok(Some(metric_message)) = self.zmq_consumer.receive_metrics().await {
|
||||||
summary: system_metrics.summary,
|
debug!(
|
||||||
timestamp: system_metrics.timestamp,
|
"Received metrics from {}: {} metrics",
|
||||||
};
|
metric_message.hostname,
|
||||||
self.history.record_system(system_snapshot.clone());
|
metric_message.metrics.len()
|
||||||
state.system = Some(system_snapshot);
|
);
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(mut backup_metrics) = backup {
|
// Check if this is the first time we've seen this host
|
||||||
if backup_metrics.timestamp != timestamp {
|
let is_new_host = !self
|
||||||
backup_metrics.timestamp = timestamp;
|
.initial_commands_sent
|
||||||
|
.contains(&metric_message.hostname);
|
||||||
|
|
||||||
|
if is_new_host {
|
||||||
|
info!(
|
||||||
|
"First contact with host {}, sending initial CollectNow command",
|
||||||
|
metric_message.hostname
|
||||||
|
);
|
||||||
|
|
||||||
|
// Send CollectNow command for immediate refresh
|
||||||
|
if let Err(e) = self
|
||||||
|
.send_command(&metric_message.hostname, AgentCommand::CollectNow)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
error!(
|
||||||
|
"Failed to send initial CollectNow command to {}: {}",
|
||||||
|
metric_message.hostname, e
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
info!(
|
||||||
|
"✓ Sent initial CollectNow command to {}",
|
||||||
|
metric_message.hostname
|
||||||
|
);
|
||||||
|
self.initial_commands_sent
|
||||||
|
.insert(metric_message.hostname.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update metric store
|
||||||
|
self.metric_store
|
||||||
|
.update_metrics(&metric_message.hostname, metric_message.metrics);
|
||||||
|
|
||||||
|
// Check for agent version mismatches across hosts
|
||||||
|
if let Some((current_version, outdated_hosts)) = self.metric_store.get_version_mismatches() {
|
||||||
|
for outdated_host in &outdated_hosts {
|
||||||
|
warn!("Host {} has outdated agent version (current: {})", outdated_host, current_version);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update TUI with new hosts and metrics (only if not headless)
|
||||||
|
if let Some(ref mut tui_app) = self.tui_app {
|
||||||
|
let connected_hosts = self
|
||||||
|
.metric_store
|
||||||
|
.get_connected_hosts(Duration::from_secs(30));
|
||||||
|
|
||||||
|
|
||||||
|
tui_app.update_hosts(connected_hosts);
|
||||||
|
tui_app.update_metrics(&self.metric_store);
|
||||||
}
|
}
|
||||||
let snapshot = backup_metrics.clone();
|
|
||||||
self.history.record_backup(backup_metrics);
|
|
||||||
state.backup = Some(snapshot);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
self.status = format!(
|
|
||||||
"Metrics update • host: {} • at {}",
|
|
||||||
host,
|
|
||||||
timestamp.format("%H:%M:%S")
|
|
||||||
);
|
|
||||||
}
|
|
||||||
AppEvent::MetricsFailed {
|
|
||||||
host,
|
|
||||||
error,
|
|
||||||
timestamp,
|
|
||||||
} => {
|
|
||||||
self.zmq_connected = false;
|
|
||||||
self.ensure_host_entry(&host);
|
|
||||||
let state = self.host_states.entry(host.clone()).or_default();
|
|
||||||
state.last_error = Some(format!("{} at {}", error, timestamp.format("%H:%M:%S")));
|
|
||||||
state.connection_status = ConnectionStatus::Error;
|
|
||||||
|
|
||||||
self.status = format!("Fetch failed • host: {} • {}", host, error);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn check_host_timeouts(&mut self) {
|
|
||||||
let now = Utc::now();
|
|
||||||
|
|
||||||
for (_host_name, state) in self.host_states.iter_mut() {
|
|
||||||
if let Some(last_success) = state.last_success {
|
|
||||||
let duration_since_last = now.signed_duration_since(last_success);
|
|
||||||
|
|
||||||
if duration_since_last > chrono::Duration::from_std(HOST_CONNECTION_TIMEOUT).unwrap() {
|
// Also check for command output messages
|
||||||
// Host has timed out (missed keep-alive)
|
if let Ok(Some(cmd_output)) = self.zmq_consumer.receive_command_output().await {
|
||||||
if !matches!(state.connection_status, ConnectionStatus::Timeout) {
|
debug!(
|
||||||
state.connection_status = ConnectionStatus::Timeout;
|
"Received command output from {}: {}",
|
||||||
state.last_error = Some(format!("Keep-alive timeout (no data for {}s)", duration_since_last.num_seconds()));
|
cmd_output.hostname,
|
||||||
|
cmd_output.output_line
|
||||||
|
);
|
||||||
|
|
||||||
|
// Command output (terminal popup removed - output not displayed)
|
||||||
|
}
|
||||||
|
|
||||||
|
last_metrics_check = Instant::now();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Render TUI (only if not headless)
|
||||||
|
if !self.headless {
|
||||||
|
if let Some(ref mut terminal) = self.terminal {
|
||||||
|
if let Some(ref mut tui_app) = self.tui_app {
|
||||||
|
if let Err(e) = terminal.draw(|frame| {
|
||||||
|
tui_app.render(frame, &self.metric_store);
|
||||||
|
}) {
|
||||||
|
error!("Error rendering TUI: {}", e);
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// Host is connected
|
|
||||||
state.connection_status = ConnectionStatus::Connected;
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// No data ever received from this host
|
|
||||||
state.connection_status = ConnectionStatus::Unknown;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Small sleep to prevent excessive CPU usage
|
||||||
|
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
info!("Dashboard main loop ended");
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn help_visible(&self) -> bool {
|
/// Execute a UI command by sending it to the appropriate agent
|
||||||
self.show_help
|
async fn execute_ui_command(&self, command: UiCommand) -> Result<()> {
|
||||||
}
|
match command {
|
||||||
|
UiCommand::ServiceStart { hostname, service_name } => {
|
||||||
fn ensure_host_entry(&mut self, host: &str) {
|
info!("Sending user start command for service {} on {}", service_name, hostname);
|
||||||
if !self.host_states.contains_key(host) {
|
let agent_command = AgentCommand::ServiceControl {
|
||||||
self.host_states
|
service_name: service_name.clone(),
|
||||||
.insert(host.to_string(), HostRuntimeState::default());
|
action: ServiceAction::UserStart,
|
||||||
}
|
|
||||||
|
|
||||||
if self.hosts.iter().any(|entry| entry.name == host) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
self.hosts.push(HostTarget::from_name(host.to_string()));
|
|
||||||
if self.hosts.len() == 1 {
|
|
||||||
self.active_host_index = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn load_configuration(path: Option<&PathBuf>) -> Result<(Option<AppConfig>, Option<PathBuf>)> {
|
|
||||||
if let Some(explicit) = path {
|
|
||||||
let config = config::load_from_path(explicit)?;
|
|
||||||
return Ok((Some(config), Some(explicit.clone())));
|
|
||||||
}
|
|
||||||
|
|
||||||
let default_path = PathBuf::from("config/dashboard.toml");
|
|
||||||
if default_path.exists() {
|
|
||||||
let config = config::load_from_path(&default_path)?;
|
|
||||||
return Ok((Some(config), Some(default_path)));
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok((None, None))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn build_initial_status(host: Option<&String>, config_path: Option<&PathBuf>) -> String {
|
|
||||||
let detected = Self::local_hostname();
|
|
||||||
match (host, config_path, detected.as_ref()) {
|
|
||||||
(Some(host), Some(path), _) => {
|
|
||||||
format!("Ready • host: {} • config: {}", host, path.display())
|
|
||||||
}
|
|
||||||
(Some(host), None, _) => format!("Ready • host: {}", host),
|
|
||||||
(None, Some(path), Some(local)) => format!(
|
|
||||||
"Ready • host: {} (auto) • config: {}",
|
|
||||||
local,
|
|
||||||
path.display()
|
|
||||||
),
|
|
||||||
(None, Some(path), None) => format!("Ready • config: {}", path.display()),
|
|
||||||
(None, None, Some(local)) => format!("Ready • host: {} (auto)", local),
|
|
||||||
(None, None, None) => "Ready • no host selected".to_string(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn select_hosts(host: Option<&String>, _config: Option<&AppConfig>) -> Vec<HostTarget> {
|
|
||||||
let mut targets = Vec::new();
|
|
||||||
|
|
||||||
// Use default hosts for auto-discovery
|
|
||||||
|
|
||||||
if let Some(filter) = host {
|
|
||||||
// If specific host requested, only connect to that one
|
|
||||||
return vec![HostTarget::from_name(filter.clone())];
|
|
||||||
}
|
|
||||||
|
|
||||||
let local_host = Self::local_hostname();
|
|
||||||
|
|
||||||
// Always use auto-discovery - skip config files
|
|
||||||
if let Some(local) = local_host.as_ref() {
|
|
||||||
targets.push(HostTarget::from_name(local.clone()));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add all default hosts for auto-discovery
|
|
||||||
for hostname in DEFAULT_HOSTS {
|
|
||||||
if targets
|
|
||||||
.iter()
|
|
||||||
.any(|existing| existing.name.eq_ignore_ascii_case(hostname))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
targets.push(HostTarget::from_name(hostname.to_string()));
|
|
||||||
}
|
|
||||||
|
|
||||||
if targets.is_empty() {
|
|
||||||
targets.push(HostTarget::from_name("localhost".to_string()));
|
|
||||||
}
|
|
||||||
|
|
||||||
targets
|
|
||||||
}
|
|
||||||
|
|
||||||
fn history_capacity_hint(config: Option<&AppConfig>) -> usize {
|
|
||||||
const DEFAULT_CAPACITY: usize = 120;
|
|
||||||
const SAMPLE_SECONDS: u64 = 30;
|
|
||||||
|
|
||||||
let Some(config) = config else {
|
|
||||||
return DEFAULT_CAPACITY;
|
|
||||||
};
|
|
||||||
|
|
||||||
let minutes = config.dashboard.history_duration_minutes.max(1);
|
|
||||||
let total_seconds = minutes.saturating_mul(60);
|
|
||||||
let samples = total_seconds / SAMPLE_SECONDS;
|
|
||||||
usize::try_from(samples.max(1)).unwrap_or(DEFAULT_CAPACITY)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Hosts that have delivered at least one successful metrics update.
///
/// A host counts as connected when its runtime state exists and records a
/// `last_success` timestamp; hosts with no state at all are excluded.
fn connected_hosts(&self) -> Vec<&HostTarget> {
    self.hosts
        .iter()
        .filter(|host| {
            self.host_states
                .get(&host.name)
                .map_or(false, |state| state.last_success.is_some())
        })
        .collect()
}
|
|
||||||
|
|
||||||
fn select_previous_host(&mut self) {
|
|
||||||
let connected = self.connected_hosts();
|
|
||||||
if connected.is_empty() {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find current host in connected list
|
|
||||||
let current_host = self.hosts.get(self.active_host_index);
|
|
||||||
if let Some(current) = current_host {
|
|
||||||
if let Some(current_pos) = connected.iter().position(|h| h.name == current.name) {
|
|
||||||
let new_pos = if current_pos == 0 {
|
|
||||||
connected.len().saturating_sub(1)
|
|
||||||
} else {
|
|
||||||
current_pos - 1
|
|
||||||
};
|
};
|
||||||
let new_host = connected[new_pos];
|
self.zmq_command_sender.send_command(&hostname, agent_command).await?;
|
||||||
// Find this host's index in the full hosts list
|
}
|
||||||
if let Some(new_index) = self.hosts.iter().position(|h| h.name == new_host.name) {
|
UiCommand::ServiceStop { hostname, service_name } => {
|
||||||
self.active_host_index = new_index;
|
info!("Sending user stop command for service {} on {}", service_name, hostname);
|
||||||
}
|
let agent_command = AgentCommand::ServiceControl {
|
||||||
} else {
|
service_name: service_name.clone(),
|
||||||
// Current host not connected, switch to first connected host
|
action: ServiceAction::UserStop,
|
||||||
if let Some(new_index) = self.hosts.iter().position(|h| h.name == connected[0].name) {
|
};
|
||||||
self.active_host_index = new_index;
|
self.zmq_command_sender.send_command(&hostname, agent_command).await?;
|
||||||
}
|
}
|
||||||
|
UiCommand::TriggerBackup { hostname } => {
|
||||||
|
info!("Trigger backup requested for {}", hostname);
|
||||||
|
// TODO: Implement backup trigger command
|
||||||
|
info!("Backup trigger not yet implemented");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
self.status = format!(
|
|
||||||
"Active host switched to {} ({}/{})",
|
|
||||||
self.hosts[self.active_host_index].name,
|
|
||||||
self.active_host_index + 1,
|
|
||||||
self.hosts.len()
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn select_next_host(&mut self) {
|
}
|
||||||
let connected = self.connected_hosts();
|
|
||||||
if connected.is_empty() {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find current host in connected list
|
impl Drop for Dashboard {
|
||||||
let current_host = self.hosts.get(self.active_host_index);
|
fn drop(&mut self) {
|
||||||
if let Some(current) = current_host {
|
// Restore terminal (only if not headless)
|
||||||
if let Some(current_pos) = connected.iter().position(|h| h.name == current.name) {
|
if !self.headless {
|
||||||
let new_pos = (current_pos + 1) % connected.len();
|
let _ = disable_raw_mode();
|
||||||
let new_host = connected[new_pos];
|
if let Some(ref mut terminal) = self.terminal {
|
||||||
// Find this host's index in the full hosts list
|
let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen);
|
||||||
if let Some(new_index) = self.hosts.iter().position(|h| h.name == new_host.name) {
|
let _ = terminal.show_cursor();
|
||||||
self.active_host_index = new_index;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Current host not connected, switch to first connected host
|
|
||||||
if let Some(new_index) = self.hosts.iter().position(|h| h.name == connected[0].name) {
|
|
||||||
self.active_host_index = new_index;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
self.status = format!(
|
|
||||||
"Active host switched to {} ({}/{})",
|
|
||||||
self.hosts[self.active_host_index].name,
|
|
||||||
self.active_host_index + 1,
|
|
||||||
self.hosts.len()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn resolve_zmq_config(config: Option<&AppConfig>) -> (Vec<String>, Option<String>) {
|
|
||||||
let default = ZmqConfig::default();
|
|
||||||
let zmq_config = config
|
|
||||||
.and_then(|cfg| {
|
|
||||||
if cfg.data_source.kind == DataSourceKind::Zmq {
|
|
||||||
Some(cfg.data_source.zmq.clone())
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.unwrap_or(default);
|
|
||||||
|
|
||||||
let endpoints = if zmq_config.endpoints.is_empty() {
|
|
||||||
// Generate endpoints for all default hosts
|
|
||||||
let mut endpoints = Vec::new();
|
|
||||||
|
|
||||||
// Always include localhost
|
|
||||||
endpoints.push("tcp://127.0.0.1:6130".to_string());
|
|
||||||
|
|
||||||
// Add endpoint for each default host
|
|
||||||
for host in DEFAULT_HOSTS {
|
|
||||||
endpoints.push(format!("tcp://{}:6130", host));
|
|
||||||
}
|
|
||||||
|
|
||||||
endpoints
|
|
||||||
} else {
|
|
||||||
zmq_config.endpoints.clone()
|
|
||||||
};
|
|
||||||
|
|
||||||
(endpoints, zmq_config.subscribe.clone())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl App {
|
|
||||||
fn local_hostname() -> Option<String> {
|
|
||||||
let raw = gethostname();
|
|
||||||
let value = raw.to_string_lossy().trim().to_string();
|
|
||||||
if value.is_empty() {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some(value)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Snapshot of everything the UI needs to render a single host panel.
#[derive(Debug, Clone)]
pub struct HostDisplayData {
    /// Host name used both for display and for state lookups.
    pub name: String,
    /// Timestamp of the most recent successful metrics fetch, if any.
    pub last_success: Option<DateTime<Utc>>,
    /// Most recent error message for this host, if any.
    pub last_error: Option<String>,
    /// Current connectivity classification for this host.
    pub connection_status: ConnectionStatus,
    /// Latest SMART/drive metrics; `None` until first received.
    pub smart: Option<SmartMetrics>,
    /// Latest service metrics; `None` until first received.
    pub services: Option<ServiceMetrics>,
    /// Latest system (CPU/memory) metrics; `None` until first received.
    pub system: Option<SystemMetrics>,
    /// Latest backup metrics; `None` until first received.
    pub backup: Option<BackupMetrics>,
}
|
|
||||||
|
|
||||||
/// Connection parameters for the ZMQ data source: endpoint addresses plus
/// an optional subscription topic filter.
#[derive(Debug, Clone)]
pub struct ZmqContext {
    endpoints: Vec<String>,
    subscription: Option<String>,
}

impl ZmqContext {
    /// Bundle endpoint addresses with an optional subscription topic.
    pub fn new(endpoints: Vec<String>, subscription: Option<String>) -> Self {
        ZmqContext {
            endpoints,
            subscription,
        }
    }

    /// Endpoint addresses the consumer should connect to.
    pub fn endpoints(&self) -> &[String] {
        self.endpoints.as_slice()
    }

    /// Subscription topic filter, if one was configured.
    pub fn subscription(&self) -> Option<&str> {
        self.subscription.as_ref().map(String::as_str)
    }
}
|
|
||||||
|
|
||||||
/// Events delivered from the data-collection side to the application loop.
#[derive(Debug)]
pub enum AppEvent {
    /// Fresh metrics arrived for `host`. Each section is `None` when the
    /// update did not include it.
    MetricsUpdated {
        host: String,
        smart: Option<SmartMetrics>,
        services: Option<ServiceMetrics>,
        system: Option<SystemMetrics>,
        backup: Option<BackupMetrics>,
        timestamp: DateTime<Utc>,
    },
    /// A metrics fetch for `host` failed with `error` at `timestamp`.
    MetricsFailed {
        host: String,
        error: String,
        timestamp: DateTime<Utc>,
    },
    /// Shutdown signal for the application loop.
    Shutdown,
}
|
|
||||||
|
|||||||
233
dashboard/src/communication/mod.rs
Normal file
233
dashboard/src/communication/mod.rs
Normal file
@@ -0,0 +1,233 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use cm_dashboard_shared::{CommandOutputMessage, MessageEnvelope, MessageType, MetricMessage};
|
||||||
|
use tracing::{debug, error, info, warn};
|
||||||
|
use zmq::{Context, Socket, SocketType};
|
||||||
|
|
||||||
|
use crate::config::ZmqConfig;
|
||||||
|
|
||||||
|
/// Commands that can be sent to agents
///
/// Serialized as JSON and delivered over the agent's command socket.
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
pub enum AgentCommand {
    /// Request immediate metric collection
    CollectNow,
    /// Change collection interval
    SetInterval { seconds: u64 },
    /// Enable/disable a collector
    ToggleCollector { name: String, enabled: bool },
    /// Request status/health check
    Ping,
    /// Control systemd service
    ServiceControl {
        service_name: String,
        action: ServiceAction,
    },
    /// Rebuild NixOS system
    SystemRebuild {
        // Git repository holding the NixOS configuration.
        git_url: String,
        git_branch: String,
        // Directory on the agent in which the rebuild runs.
        working_dir: String,
        // Optional path to a key file — presumably used to authenticate
        // the checkout/rebuild; TODO confirm against the agent side.
        api_key_file: Option<String>,
    },
}
|
||||||
|
|
||||||
|
/// Service control actions
///
/// Carried inside `AgentCommand::ServiceControl`; the distinction between
/// plain and `User*` variants is whether the action was user-initiated.
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
pub enum ServiceAction {
    Start,
    Stop,
    Status,
    UserStart, // User-initiated start (clears user-stopped flag)
    UserStop,  // User-initiated stop (marks as user-stopped)
}
|
||||||
|
|
||||||
|
/// ZMQ consumer for receiving metrics from agents
pub struct ZmqConsumer {
    /// Single SUB socket shared by all connected agent endpoints.
    subscriber: Socket,
    /// Dashboard-side ZMQ settings (subscriber ports etc.).
    config: ZmqConfig,
    /// Hostnames for which `connect` has succeeded at least once.
    connected_hosts: std::collections::HashSet<String>,
}
|
||||||
|
|
||||||
|
impl ZmqConsumer {
|
||||||
|
/// Build the consumer: a SUB socket subscribed to every topic.
///
/// NOTE(review): the receive methods in this type use `zmq::DONTWAIT`,
/// so the 1s receive timeout set here mostly acts as a safety net —
/// confirm whether it is still needed.
pub async fn new(config: &ZmqConfig) -> Result<Self> {
    let context = Context::new();

    // Create subscriber socket
    let subscriber = context.socket(SocketType::SUB)?;

    // Set socket options
    subscriber.set_rcvtimeo(1000)?; // 1 second timeout for non-blocking receives
    subscriber.set_subscribe(b"")?; // Subscribe to all messages

    info!("ZMQ consumer initialized");

    Ok(Self {
        subscriber,
        config: config.clone(),
        connected_hosts: std::collections::HashSet::new(),
    })
}
|
||||||
|
|
||||||
|
/// Connect to a specific host's agent
|
||||||
|
pub async fn connect_to_host(&mut self, hostname: &str, port: u16) -> Result<()> {
|
||||||
|
let address = format!("tcp://{}:{}", hostname, port);
|
||||||
|
|
||||||
|
match self.subscriber.connect(&address) {
|
||||||
|
Ok(()) => {
|
||||||
|
info!("Connected to agent at {}", address);
|
||||||
|
self.connected_hosts.insert(hostname.to_string());
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("Failed to connect to agent at {}: {}", address, e);
|
||||||
|
Err(anyhow::anyhow!("Failed to connect to {}: {}", address, e))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Connect to predefined hosts
|
||||||
|
pub async fn connect_to_predefined_hosts(&mut self, hosts: &[String]) -> Result<()> {
|
||||||
|
let default_port = self.config.subscriber_ports[0];
|
||||||
|
|
||||||
|
for hostname in hosts {
|
||||||
|
// Try to connect, but don't fail if some hosts are unreachable
|
||||||
|
if let Err(e) = self.connect_to_host(hostname, default_port).await {
|
||||||
|
warn!("Could not connect to {}: {}", hostname, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"Connected to {} out of {} configured hosts",
|
||||||
|
self.connected_hosts.len(),
|
||||||
|
hosts.len()
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Receive command output from any connected agent (non-blocking)
///
/// Pulls one message off the SUB socket if available and decodes it when
/// its envelope type is `CommandOutput`.
///
/// NOTE(review): any OTHER message type received here is consumed and
/// silently discarded (`_ => Ok(None)`) — a metrics message read by this
/// method is lost to `receive_metrics`. Verify the caller's polling order
/// cannot drop metrics this way.
pub async fn receive_command_output(&mut self) -> Result<Option<CommandOutputMessage>> {
    match self.subscriber.recv_bytes(zmq::DONTWAIT) {
        Ok(data) => {
            // Deserialize envelope
            let envelope: MessageEnvelope = serde_json::from_slice(&data)
                .map_err(|e| anyhow::anyhow!("Failed to deserialize envelope: {}", e))?;

            // Check message type
            match envelope.message_type {
                MessageType::CommandOutput => {
                    let cmd_output = envelope
                        .decode_command_output()
                        .map_err(|e| anyhow::anyhow!("Failed to decode command output: {}", e))?;

                    debug!(
                        "Received command output from {}: {}",
                        cmd_output.hostname,
                        cmd_output.output_line
                    );

                    Ok(Some(cmd_output))
                }
                _ => Ok(None), // Not a command output message
            }
        }
        Err(zmq::Error::EAGAIN) => {
            // No message available (non-blocking mode)
            Ok(None)
        }
        Err(e) => {
            error!("ZMQ receive error: {}", e);
            Err(anyhow::anyhow!("ZMQ receive error: {}", e))
        }
    }
}
|
||||||
|
|
||||||
|
/// Receive metrics from any connected agent (non-blocking)
///
/// Pulls one message off the SUB socket if available; only `Metrics`
/// envelopes produce a result. Heartbeats and unknown types are consumed
/// and dropped.
pub async fn receive_metrics(&mut self) -> Result<Option<MetricMessage>> {
    match self.subscriber.recv_bytes(zmq::DONTWAIT) {
        Ok(data) => {
            debug!("Received {} bytes from ZMQ", data.len());

            // Deserialize envelope
            let envelope: MessageEnvelope = serde_json::from_slice(&data)
                .map_err(|e| anyhow::anyhow!("Failed to deserialize envelope: {}", e))?;

            // Check message type
            match envelope.message_type {
                MessageType::Metrics => {
                    let metrics = envelope
                        .decode_metrics()
                        .map_err(|e| anyhow::anyhow!("Failed to decode metrics: {}", e))?;

                    debug!(
                        "Received {} metrics from {}",
                        metrics.metrics.len(),
                        metrics.hostname
                    );

                    Ok(Some(metrics))
                }
                MessageType::Heartbeat => {
                    debug!("Received heartbeat");
                    Ok(None) // Don't return heartbeats as metrics
                }
                MessageType::CommandOutput => {
                    // NOTE(review): this branch CONSUMES the command-output
                    // message — it will not subsequently be seen by
                    // `receive_command_output`, despite what the log text
                    // suggests. Confirm callers can tolerate the drop.
                    debug!("Received command output (will be handled by receive_command_output)");
                    Ok(None) // Command output handled by separate method
                }
                _ => {
                    debug!("Received non-metrics message: {:?}", envelope.message_type);
                    Ok(None)
                }
            }
        }
        Err(zmq::Error::EAGAIN) => {
            // No message available (non-blocking mode)
            Ok(None)
        }
        Err(e) => {
            error!("ZMQ receive error: {}", e);
            Err(anyhow::anyhow!("ZMQ receive error: {}", e))
        }
    }
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// ZMQ command sender for sending commands to agents
pub struct ZmqCommandSender {
    /// Shared ZMQ context; a fresh PUSH socket is created per command sent.
    context: Context,
}
|
||||||
|
|
||||||
|
impl ZmqCommandSender {
|
||||||
|
/// Create the command sender.
///
/// The config parameter is currently unused — connections are made
/// per-send in `send_command` to a fixed port.
pub fn new(_config: &ZmqConfig) -> Result<Self> {
    let context = Context::new();

    info!("ZMQ command sender initialized");

    Ok(Self { context })
}
|
||||||
|
|
||||||
|
/// Send a command to a specific agent
///
/// Opens a short-lived PUSH socket, connects to the agent's command port
/// (hard-coded 6131), sends the JSON-serialized command, and lets the
/// socket close on drop (linger capped at 1s).
///
/// NOTE(review): a successful `send` on a PUSH socket only means the
/// message was queued locally — it is not a delivery acknowledgement.
pub async fn send_command(&self, hostname: &str, command: AgentCommand) -> Result<()> {
    // Create a new PUSH socket for this command (ZMQ best practice)
    let socket = self.context.socket(SocketType::PUSH)?;

    // Set socket options
    socket.set_linger(1000)?; // Wait up to 1 second on close
    socket.set_sndtimeo(5000)?; // 5 second send timeout

    // Connect to agent's command port (6131)
    let address = format!("tcp://{}:6131", hostname);
    socket.connect(&address)?;

    // Serialize command
    let serialized = serde_json::to_vec(&command)?;

    // Send command
    socket.send(&serialized, 0)?;

    info!("Sent command {:?} to agent at {}", command, hostname);

    // Socket will be automatically closed when dropped
    Ok(())
}
|
||||||
|
}
|
||||||
@@ -1,19 +0,0 @@
|
|||||||
#![allow(dead_code)]
|
|
||||||
|
|
||||||
use std::fs;
|
|
||||||
use std::path::Path;
|
|
||||||
|
|
||||||
use anyhow::{Context, Result};
|
|
||||||
|
|
||||||
use crate::data::config::AppConfig;
|
|
||||||
|
|
||||||
/// Load application configuration from a TOML file.
|
|
||||||
pub fn load_from_path(path: &Path) -> Result<AppConfig> {
|
|
||||||
let raw = fs::read_to_string(path)
|
|
||||||
.with_context(|| format!("failed to read configuration file at {}", path.display()))?;
|
|
||||||
|
|
||||||
let config = toml::from_str::<AppConfig>(&raw)
|
|
||||||
.with_context(|| format!("failed to parse configuration file {}", path.display()))?;
|
|
||||||
|
|
||||||
Ok(config)
|
|
||||||
}
|
|
||||||
75
dashboard/src/config/mod.rs
Normal file
75
dashboard/src/config/mod.rs
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
/// Main dashboard configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DashboardConfig {
    pub zmq: ZmqConfig,
    pub hosts: HostsConfig,
    pub system: SystemConfig,
    pub ssh: SshConfig,
    /// Lists of service log files per host — keys are presumably
    /// hostnames; confirm against the TOML config file.
    pub service_logs: std::collections::HashMap<String, Vec<ServiceLogConfig>>,
}

/// ZMQ consumer configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ZmqConfig {
    /// Agent ports the dashboard subscribes to; the first entry is used
    /// as the default connection port.
    pub subscriber_ports: Vec<u16>,
}

/// Hosts configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HostsConfig {
    /// Hostnames the dashboard attempts to connect to at startup.
    pub predefined_hosts: Vec<String>,
}

/// System configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemConfig {
    /// Git repository URL holding the NixOS configuration.
    pub nixos_config_git_url: String,
    pub nixos_config_branch: String,
    /// Directory on the target where rebuilds are executed.
    pub nixos_config_working_dir: String,
    /// Optional path to an API key file used for the rebuild checkout.
    pub nixos_config_api_key_file: Option<String>,
}

/// SSH configuration for rebuild operations
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SshConfig {
    pub rebuild_user: String,
    pub rebuild_alias: String,
}

/// Service log file configuration per host
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServiceLogConfig {
    pub service_name: String,
    pub log_file_path: String,
}
|
||||||
|
|
||||||
|
impl DashboardConfig {
|
||||||
|
pub fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||||
|
let path = path.as_ref();
|
||||||
|
let content = std::fs::read_to_string(path)?;
|
||||||
|
let config: DashboardConfig = toml::from_str(&content)?;
|
||||||
|
Ok(config)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE(review): these `Default` impls deliberately panic to enforce the
// "no hardcoded defaults" policy. A panicking `Default` violates the
// trait's usual contract — any `..Default::default()`, `unwrap_or_default`
// or derive that reaches these will abort the process. Kept as-is since
// the panic message shows it is intentional, but confirm no code path
// relies on `Default` for these types.
impl Default for DashboardConfig {
    fn default() -> Self {
        panic!("Dashboard configuration must be loaded from file - no hardcoded defaults allowed")
    }
}

impl Default for ZmqConfig {
    fn default() -> Self {
        panic!("Dashboard configuration must be loaded from file - no hardcoded defaults allowed")
    }
}

impl Default for HostsConfig {
    fn default() -> Self {
        panic!("Dashboard configuration must be loaded from file - no hardcoded defaults allowed")
    }
}
|
||||||
@@ -1,150 +0,0 @@
|
|||||||
#![allow(dead_code)]
|
|
||||||
|
|
||||||
use std::collections::HashMap;
|
|
||||||
use std::path::PathBuf;
|
|
||||||
|
|
||||||
use serde::Deserialize;
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize)]
|
|
||||||
pub struct HostsConfig {
|
|
||||||
pub default_host: Option<String>,
|
|
||||||
#[serde(default)]
|
|
||||||
pub hosts: Vec<HostTarget>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize)]
|
|
||||||
pub struct HostTarget {
|
|
||||||
pub name: String,
|
|
||||||
#[serde(default = "default_true")]
|
|
||||||
pub enabled: bool,
|
|
||||||
#[serde(default)]
|
|
||||||
pub metadata: HashMap<String, String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl HostTarget {
|
|
||||||
pub fn from_name(name: String) -> Self {
|
|
||||||
Self {
|
|
||||||
name,
|
|
||||||
enabled: true,
|
|
||||||
metadata: HashMap::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize)]
|
|
||||||
pub struct DashboardConfig {
|
|
||||||
#[serde(default = "default_tick_rate_ms")]
|
|
||||||
pub tick_rate_ms: u64,
|
|
||||||
#[serde(default)]
|
|
||||||
pub history_duration_minutes: u64,
|
|
||||||
#[serde(default)]
|
|
||||||
pub widgets: Vec<WidgetConfig>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for DashboardConfig {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self {
|
|
||||||
tick_rate_ms: default_tick_rate_ms(),
|
|
||||||
history_duration_minutes: 60,
|
|
||||||
widgets: Vec::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize)]
|
|
||||||
pub struct WidgetConfig {
|
|
||||||
pub id: String,
|
|
||||||
#[serde(default)]
|
|
||||||
pub enabled: bool,
|
|
||||||
#[serde(default)]
|
|
||||||
pub options: HashMap<String, String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize)]
|
|
||||||
pub struct AppFilesystem {
|
|
||||||
pub cache_dir: Option<PathBuf>,
|
|
||||||
pub history_dir: Option<PathBuf>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize)]
|
|
||||||
pub struct AppConfig {
|
|
||||||
pub hosts: HostsConfig,
|
|
||||||
#[serde(default)]
|
|
||||||
pub dashboard: DashboardConfig,
|
|
||||||
#[serde(default = "default_data_source_config")]
|
|
||||||
pub data_source: DataSourceConfig,
|
|
||||||
#[serde(default)]
|
|
||||||
pub filesystem: Option<AppFilesystem>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize)]
|
|
||||||
pub struct DataSourceConfig {
|
|
||||||
#[serde(default = "default_data_source_kind")]
|
|
||||||
pub kind: DataSourceKind,
|
|
||||||
#[serde(default)]
|
|
||||||
pub zmq: ZmqConfig,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for DataSourceConfig {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self {
|
|
||||||
kind: DataSourceKind::Zmq,
|
|
||||||
zmq: ZmqConfig::default(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
|
|
||||||
#[serde(rename_all = "snake_case")]
|
|
||||||
pub enum DataSourceKind {
|
|
||||||
Zmq,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn default_data_source_kind() -> DataSourceKind {
|
|
||||||
DataSourceKind::Zmq
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize)]
|
|
||||||
pub struct ZmqConfig {
|
|
||||||
#[serde(default = "default_zmq_endpoints")]
|
|
||||||
pub endpoints: Vec<String>,
|
|
||||||
#[serde(default)]
|
|
||||||
pub subscribe: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for ZmqConfig {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self {
|
|
||||||
endpoints: default_zmq_endpoints(),
|
|
||||||
subscribe: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const fn default_true() -> bool {
|
|
||||||
true
|
|
||||||
}
|
|
||||||
|
|
||||||
const fn default_tick_rate_ms() -> u64 {
|
|
||||||
500
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Default hosts for auto-discovery
|
|
||||||
pub const DEFAULT_HOSTS: &[&str] = &[
|
|
||||||
"cmbox", "labbox", "simonbox", "steambox", "srv01"
|
|
||||||
];
|
|
||||||
|
|
||||||
fn default_data_source_config() -> DataSourceConfig {
|
|
||||||
DataSourceConfig::default()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn default_zmq_endpoints() -> Vec<String> {
|
|
||||||
// Default endpoints include localhost and all known CMTEC hosts
|
|
||||||
let mut endpoints = vec!["tcp://127.0.0.1:6130".to_string()];
|
|
||||||
|
|
||||||
for host in DEFAULT_HOSTS {
|
|
||||||
endpoints.push(format!("tcp://{}:6130", host));
|
|
||||||
}
|
|
||||||
|
|
||||||
endpoints
|
|
||||||
}
|
|
||||||
@@ -1,61 +0,0 @@
|
|||||||
#![allow(dead_code)]
|
|
||||||
|
|
||||||
use std::collections::VecDeque;
|
|
||||||
use std::time::Duration;
|
|
||||||
|
|
||||||
use chrono::{DateTime, Utc};
|
|
||||||
|
|
||||||
use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics, SystemMetrics};
|
|
||||||
|
|
||||||
/// Ring buffer for retaining recent samples for trend analysis.
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub struct MetricsHistory {
|
|
||||||
capacity: usize,
|
|
||||||
smart: VecDeque<(DateTime<Utc>, SmartMetrics)>,
|
|
||||||
services: VecDeque<(DateTime<Utc>, ServiceMetrics)>,
|
|
||||||
system: VecDeque<(DateTime<Utc>, SystemMetrics)>,
|
|
||||||
backups: VecDeque<(DateTime<Utc>, BackupMetrics)>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl MetricsHistory {
|
|
||||||
pub fn with_capacity(capacity: usize) -> Self {
|
|
||||||
Self {
|
|
||||||
capacity,
|
|
||||||
smart: VecDeque::with_capacity(capacity),
|
|
||||||
services: VecDeque::with_capacity(capacity),
|
|
||||||
system: VecDeque::with_capacity(capacity),
|
|
||||||
backups: VecDeque::with_capacity(capacity),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn record_smart(&mut self, metrics: SmartMetrics) {
|
|
||||||
let entry = (Utc::now(), metrics);
|
|
||||||
Self::push_with_limit(&mut self.smart, entry, self.capacity);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn record_services(&mut self, metrics: ServiceMetrics) {
|
|
||||||
let entry = (Utc::now(), metrics);
|
|
||||||
Self::push_with_limit(&mut self.services, entry, self.capacity);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn record_system(&mut self, metrics: SystemMetrics) {
|
|
||||||
let entry = (Utc::now(), metrics);
|
|
||||||
Self::push_with_limit(&mut self.system, entry, self.capacity);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn record_backup(&mut self, metrics: BackupMetrics) {
|
|
||||||
let entry = (Utc::now(), metrics);
|
|
||||||
Self::push_with_limit(&mut self.backups, entry, self.capacity);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn retention(&self) -> Duration {
|
|
||||||
Duration::from_secs((self.capacity as u64) * 30)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn push_with_limit<T>(deque: &mut VecDeque<T>, item: T, capacity: usize) {
|
|
||||||
if deque.len() == capacity {
|
|
||||||
deque.pop_front();
|
|
||||||
}
|
|
||||||
deque.push_back(item);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,189 +0,0 @@
|
|||||||
#![allow(dead_code)]
|
|
||||||
|
|
||||||
use chrono::{DateTime, Utc};
|
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
|
|
||||||
/// SMART (drive health) payload published by a smart agent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SmartMetrics {
    // Overall status string (free-form, produced by the agent).
    pub status: String,
    // Per-drive detail.
    pub drives: Vec<DriveInfo>,
    // Aggregated counts/capacity across all drives.
    pub summary: DriveSummary,
    // Human-readable issue descriptions.
    pub issues: Vec<String>,
    // Injected by the receiver when missing from the payload
    // (see handle_zmq_message, which inserts "timestamp" before parsing).
    pub timestamp: DateTime<Utc>,
}
|
|
||||||
|
|
||||||
/// Health details for a single drive.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DriveInfo {
    pub name: String,
    pub temperature_c: f32,
    // presumably percentage wear — confirm against the agent's SMART mapping
    pub wear_level: f32,
    pub power_on_hours: u64,
    pub available_spare: f32,
    // Capacity/usage may be absent when the agent cannot determine them.
    pub capacity_gb: Option<f32>,
    pub used_gb: Option<f32>,
    // Optional extra description lines; defaults to None for older payloads.
    #[serde(default)]
    pub description: Option<Vec<String>>,
}
|
|
||||||
|
|
||||||
/// Aggregate drive counts and capacity totals across all drives of a host.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DriveSummary {
    pub healthy: usize,
    pub warning: usize,
    pub critical: usize,
    pub capacity_total_gb: f32,
    pub capacity_used_gb: f32,
}
|
|
||||||
|
|
||||||
/// System (CPU/memory) payload published by a system agent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
    pub summary: SystemSummary,
    // Raw integer timestamp from the agent — presumably Unix epoch seconds
    // (the envelope timestamp is treated as seconds); confirm.
    pub timestamp: u64,
}
|
|
||||||
|
|
||||||
/// Host-level CPU, memory and session details.
///
/// Fields marked `#[serde(default)]` may be missing from older agent
/// payloads and fall back to `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemSummary {
    // Load averages (1/5/15 minutes).
    pub cpu_load_1: f32,
    pub cpu_load_5: f32,
    pub cpu_load_15: f32,
    #[serde(default)]
    pub cpu_status: Option<String>,
    pub memory_used_mb: f32,
    pub memory_total_mb: f32,
    pub memory_usage_percent: f32,
    #[serde(default)]
    pub memory_status: Option<String>,
    #[serde(default)]
    pub cpu_temp_c: Option<f32>,
    #[serde(default)]
    pub cpu_temp_status: Option<String>,
    #[serde(default)]
    pub cpu_cstate: Option<Vec<String>>,
    #[serde(default)]
    pub logged_in_users: Option<Vec<String>>,
    #[serde(default)]
    pub top_cpu_process: Option<String>,
    #[serde(default)]
    pub top_ram_process: Option<String>,
}
|
|
||||||
|
|
||||||
/// Service payload published by a service agent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServiceMetrics {
    // Aggregates across all services on the host.
    pub summary: ServiceSummary,
    // Per-service detail.
    pub services: Vec<ServiceInfo>,
    // Injected by the receiver when missing from the payload.
    pub timestamp: DateTime<Utc>,
}
|
|
||||||
|
|
||||||
/// Aggregated service health plus host resource context.
///
/// `#[serde(default)]` fields tolerate older agents that omit them
/// (numeric fields default to 0, options to `None`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServiceSummary {
    pub healthy: usize,
    pub degraded: usize,
    pub failed: usize,
    #[serde(default)]
    pub services_status: Option<String>,
    // Memory consumed by / allotted to the managed services.
    pub memory_used_mb: f32,
    pub memory_quota_mb: f32,
    // Whole-host memory figures for context.
    #[serde(default)]
    pub system_memory_used_mb: f32,
    #[serde(default)]
    pub system_memory_total_mb: f32,
    #[serde(default)]
    pub memory_status: Option<String>,
    #[serde(default)]
    pub disk_used_gb: f32,
    #[serde(default)]
    pub disk_total_gb: f32,
    // Load averages (1/5/15 minutes).
    #[serde(default)]
    pub cpu_load_1: f32,
    #[serde(default)]
    pub cpu_load_5: f32,
    #[serde(default)]
    pub cpu_load_15: f32,
    #[serde(default)]
    pub cpu_status: Option<String>,
    #[serde(default)]
    pub cpu_cstate: Option<Vec<String>>,
    #[serde(default)]
    pub cpu_temp_c: Option<f32>,
    #[serde(default)]
    pub cpu_temp_status: Option<String>,
    #[serde(default)]
    pub gpu_load_percent: Option<f32>,
    #[serde(default)]
    pub gpu_temp_c: Option<f32>,
}
|
|
||||||
|
|
||||||
/// Health and resource usage of a single managed service.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServiceInfo {
    pub name: String,
    pub status: ServiceStatus,
    pub memory_used_mb: f32,
    pub memory_quota_mb: f32,
    pub cpu_percent: f32,
    // presumably a sandbox resource cap — confirm units against the agent
    pub sandbox_limit: Option<f32>,
    #[serde(default)]
    pub disk_used_gb: f32,
    #[serde(default)]
    pub disk_quota_gb: f32,
    #[serde(default)]
    pub is_sandboxed: bool,
    #[serde(default)]
    pub is_sandbox_excluded: bool,
    // Optional extra description lines for the UI.
    #[serde(default)]
    pub description: Option<Vec<String>>,
    #[serde(default)]
    pub sub_service: Option<String>,
    #[serde(default)]
    pub latency_ms: Option<f32>,
}
|
|
||||||
|
|
||||||
/// Lifecycle state reported for a single managed service.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ServiceStatus {
    Running,
    Degraded,
    Restarting,
    Stopped,
}
|
|
||||||
|
|
||||||
/// Backup payload published by a backup agent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BackupMetrics {
    // Free-form overall status string from the agent.
    pub overall_status: String,
    pub backup: BackupInfo,
    pub service: BackupServiceInfo,
    // Backup target disk details; absent for agents that don't report them.
    #[serde(default)]
    pub disk: Option<BackupDiskInfo>,
    // Injected by the receiver when missing from the payload.
    pub timestamp: DateTime<Utc>,
}
|
|
||||||
|
|
||||||
/// Outcome and size details of the backup archive itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BackupInfo {
    // None when no successful/failed run has been observed yet.
    pub last_success: Option<DateTime<Utc>>,
    pub last_failure: Option<DateTime<Utc>>,
    pub size_gb: f32,
    #[serde(default)]
    pub latest_archive_size_gb: Option<f32>,
    pub snapshot_count: u32,
}
|
|
||||||
|
|
||||||
/// State of the backup service/daemon.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BackupServiceInfo {
    pub enabled: bool,
    pub pending_jobs: u32,
    // Most recent status/log message, if any.
    pub last_message: Option<String>,
}
|
|
||||||
|
|
||||||
/// Capacity and health of the disk backups are written to.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BackupDiskInfo {
    // Device path/identifier (e.g. as reported by the agent).
    pub device: String,
    // Free-form health string from the agent.
    pub health: String,
    pub total_gb: f32,
    pub used_gb: f32,
    pub usage_percent: f32,
}
|
|
||||||
|
|
||||||
/// Coarse backup health classification.
// NOTE(review): not referenced elsewhere in this chunk (file has
// #![allow(dead_code)]); confirm whether it is still used.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BackupStatus {
    Healthy,
    Warning,
    Failed,
    Unknown,
}
|
|
||||||
@@ -1,3 +0,0 @@
|
|||||||
pub mod config;
|
|
||||||
pub mod history;
|
|
||||||
pub mod metrics;
|
|
||||||
@@ -1,547 +1,115 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use clap::Parser;
|
||||||
|
use std::process;
|
||||||
|
use tracing::{error, info};
|
||||||
|
use tracing_subscriber::EnvFilter;
|
||||||
|
|
||||||
mod app;
|
mod app;
|
||||||
|
mod communication;
|
||||||
mod config;
|
mod config;
|
||||||
mod data;
|
mod metrics;
|
||||||
mod ui;
|
mod ui;
|
||||||
|
|
||||||
use std::fs;
|
use app::Dashboard;
|
||||||
use std::io::{self, Stdout};
|
|
||||||
use std::path::{Path, PathBuf};
|
|
||||||
use std::sync::{
|
|
||||||
atomic::{AtomicBool, Ordering},
|
|
||||||
Arc, OnceLock,
|
|
||||||
};
|
|
||||||
use std::time::Duration;
|
|
||||||
|
|
||||||
use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics, SystemMetrics};
|
|
||||||
use anyhow::{anyhow, Context, Result};
|
|
||||||
use chrono::{TimeZone, Utc};
|
|
||||||
use clap::{ArgAction, Parser, Subcommand};
|
|
||||||
use cm_dashboard_shared::envelope::{AgentType, MetricsEnvelope};
|
|
||||||
use crossterm::event::{self, Event};
|
|
||||||
use crossterm::terminal::{disable_raw_mode, enable_raw_mode};
|
|
||||||
use crossterm::{execute, terminal};
|
|
||||||
use ratatui::backend::CrosstermBackend;
|
|
||||||
use ratatui::Terminal;
|
|
||||||
use serde_json::Value;
|
|
||||||
use tokio::sync::mpsc::{
|
|
||||||
error::TryRecvError, unbounded_channel, UnboundedReceiver, UnboundedSender,
|
|
||||||
};
|
|
||||||
use tokio::task::{spawn_blocking, JoinHandle};
|
|
||||||
use tracing::{debug, warn};
|
|
||||||
use tracing_appender::non_blocking::WorkerGuard;
|
|
||||||
use tracing_subscriber::EnvFilter;
|
|
||||||
use zmq::{Context as NativeZmqContext, Message as NativeZmqMessage};
|
|
||||||
|
|
||||||
use crate::app::{App, AppEvent, AppOptions, ZmqContext};
|
/// Check if running inside tmux session
|
||||||
|
fn check_tmux_session() {
|
||||||
static LOG_GUARD: OnceLock<WorkerGuard> = OnceLock::new();
|
// Check for TMUX environment variable which is set when inside a tmux session
|
||||||
|
if std::env::var("TMUX").is_err() {
|
||||||
#[derive(Parser, Debug)]
|
eprintln!("╭─────────────────────────────────────────────────────────────╮");
|
||||||
#[command(
|
eprintln!("│ ⚠️ TMUX REQUIRED │");
|
||||||
name = "cm-dashboard",
|
eprintln!("├─────────────────────────────────────────────────────────────┤");
|
||||||
version,
|
eprintln!("│ CM Dashboard must be run inside a tmux session for proper │");
|
||||||
about = "Infrastructure monitoring TUI for CMTEC"
|
eprintln!("│ terminal handling and remote operation functionality. │");
|
||||||
)]
|
eprintln!("│ │");
|
||||||
struct Cli {
|
eprintln!("│ Please start a tmux session first: │");
|
||||||
#[command(subcommand)]
|
eprintln!("│ tmux new-session -d -s dashboard cm-dashboard │");
|
||||||
command: Option<Command>,
|
eprintln!("│ tmux attach-session -t dashboard │");
|
||||||
/// Optional path to configuration TOML file
|
eprintln!("│ │");
|
||||||
#[arg(long, value_name = "FILE")]
|
eprintln!("│ Or simply: │");
|
||||||
config: Option<PathBuf>,
|
eprintln!("│ tmux │");
|
||||||
|
eprintln!("│ cm-dashboard │");
|
||||||
/// Limit dashboard to a single host
|
eprintln!("╰─────────────────────────────────────────────────────────────╯");
|
||||||
#[arg(short = 'H', long, value_name = "HOST")]
|
process::exit(1);
|
||||||
host: Option<String>,
|
}
|
||||||
|
|
||||||
/// Interval (ms) to refresh dashboard when idle
|
|
||||||
#[arg(long, default_value_t = 250)]
|
|
||||||
tick_rate: u64,
|
|
||||||
|
|
||||||
/// Increase logging verbosity (-v, -vv)
|
|
||||||
#[arg(short, long, action = ArgAction::Count)]
|
|
||||||
verbose: u8,
|
|
||||||
|
|
||||||
/// Override ZMQ endpoints (comma-separated)
|
|
||||||
#[arg(long, value_delimiter = ',', value_name = "ENDPOINT")]
|
|
||||||
zmq_endpoint: Vec<String>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Subcommand, Debug)]
|
#[derive(Parser)]
|
||||||
enum Command {
|
#[command(name = "cm-dashboard")]
|
||||||
/// Generate default configuration files
|
#[command(about = "CM Dashboard TUI with individual metric consumption")]
|
||||||
InitConfig {
|
#[command(version)]
|
||||||
#[arg(long, value_name = "DIR", default_value = "config")]
|
struct Cli {
|
||||||
dir: PathBuf,
|
/// Increase logging verbosity (-v, -vv)
|
||||||
/// Overwrite existing files if they already exist
|
#[arg(short, long, action = clap::ArgAction::Count)]
|
||||||
#[arg(long, action = ArgAction::SetTrue)]
|
verbose: u8,
|
||||||
force: bool,
|
|
||||||
},
|
/// Configuration file path (defaults to /etc/cm-dashboard/dashboard.toml)
|
||||||
|
#[arg(short, long)]
|
||||||
|
config: Option<String>,
|
||||||
|
|
||||||
|
/// Run in headless mode (no TUI, just logging)
|
||||||
|
#[arg(long)]
|
||||||
|
headless: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<()> {
|
async fn main() -> Result<()> {
|
||||||
let cli = Cli::parse();
|
let cli = Cli::parse();
|
||||||
|
|
||||||
if let Some(Command::InitConfig { dir, force }) = cli.command.as_ref() {
|
// Setup logging - only if headless or verbose
|
||||||
init_tracing(cli.verbose)?;
|
if cli.headless || cli.verbose > 0 {
|
||||||
generate_config_templates(dir, *force)?;
|
let log_level = match cli.verbose {
|
||||||
return Ok(());
|
0 => "warn", // Only warnings and errors when not verbose
|
||||||
|
1 => "info",
|
||||||
|
2 => "debug",
|
||||||
|
_ => "trace",
|
||||||
|
};
|
||||||
|
|
||||||
|
tracing_subscriber::fmt()
|
||||||
|
.with_env_filter(EnvFilter::from_default_env().add_directive(log_level.parse()?))
|
||||||
|
.init();
|
||||||
|
} else {
|
||||||
|
// No logging output when running TUI mode
|
||||||
|
tracing_subscriber::fmt()
|
||||||
|
.with_env_filter(EnvFilter::from_default_env().add_directive("off".parse()?))
|
||||||
|
.init();
|
||||||
}
|
}
|
||||||
|
|
||||||
ensure_default_config(&cli)?;
|
// Check for tmux session requirement (only for TUI mode)
|
||||||
|
if !cli.headless {
|
||||||
|
check_tmux_session();
|
||||||
|
}
|
||||||
|
|
||||||
let options = AppOptions {
|
if cli.headless || cli.verbose > 0 {
|
||||||
config: cli.config,
|
info!("CM Dashboard starting with individual metrics architecture...");
|
||||||
host: cli.host,
|
}
|
||||||
tick_rate: Duration::from_millis(cli.tick_rate.max(16)),
|
|
||||||
verbosity: cli.verbose,
|
// Create and run dashboard
|
||||||
zmq_endpoints_override: cli.zmq_endpoint,
|
let mut dashboard = Dashboard::new(cli.config, cli.headless).await?;
|
||||||
|
|
||||||
|
// Setup graceful shutdown
|
||||||
|
let ctrl_c = async {
|
||||||
|
tokio::signal::ctrl_c()
|
||||||
|
.await
|
||||||
|
.expect("failed to install Ctrl+C handler");
|
||||||
};
|
};
|
||||||
|
|
||||||
init_tracing(options.verbosity)?;
|
// Run dashboard with graceful shutdown
|
||||||
|
tokio::select! {
|
||||||
let mut app = App::new(options)?;
|
result = dashboard.run() => {
|
||||||
let (event_tx, mut event_rx) = unbounded_channel();
|
if let Err(e) = result {
|
||||||
|
error!("Dashboard error: {}", e);
|
||||||
let shutdown_flag = Arc::new(AtomicBool::new(false));
|
return Err(e);
|
||||||
|
|
||||||
let zmq_task = if let Some(context) = app.zmq_context() {
|
|
||||||
Some(spawn_metrics_task(
|
|
||||||
context,
|
|
||||||
event_tx.clone(),
|
|
||||||
shutdown_flag.clone(),
|
|
||||||
))
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut terminal = setup_terminal()?;
|
|
||||||
let result = run_app(&mut terminal, &mut app, &mut event_rx);
|
|
||||||
teardown_terminal(terminal)?;
|
|
||||||
shutdown_flag.store(true, Ordering::Relaxed);
|
|
||||||
let _ = event_tx.send(AppEvent::Shutdown);
|
|
||||||
if let Some(handle) = zmq_task {
|
|
||||||
if let Err(join_error) = handle.await {
|
|
||||||
warn!(%join_error, "ZMQ metrics task ended unexpectedly");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
result
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Switch the terminal into raw mode and the alternate screen, then wrap
/// stdout in a ratatui terminal. Undone by `teardown_terminal`.
fn setup_terminal() -> Result<Terminal<CrosstermBackend<Stdout>>> {
    // Raw mode must be enabled before entering the alternate screen.
    enable_raw_mode()?;
    let mut out = io::stdout();
    execute!(out, terminal::EnterAlternateScreen)?;
    Ok(Terminal::new(CrosstermBackend::new(out))?)
}
|
|
||||||
|
|
||||||
/// Restore the terminal to its normal state: leave raw mode, return from the
/// alternate screen, and make the cursor visible again. Counterpart of
/// `setup_terminal`; order of the steps mirrors the setup in reverse.
fn teardown_terminal(mut terminal: Terminal<CrosstermBackend<Stdout>>) -> Result<()> {
    disable_raw_mode()?;
    execute!(terminal.backend_mut(), terminal::LeaveAlternateScreen)?;
    terminal.show_cursor()?;
    Ok(())
}
|
|
||||||
|
|
||||||
/// Main UI loop: drain queued app events, redraw, then wait up to one tick
/// for keyboard input; on timeout, advance the app's tick logic. Runs until
/// the app requests shutdown.
fn run_app(
    terminal: &mut Terminal<CrosstermBackend<Stdout>>,
    app: &mut App,
    event_rx: &mut UnboundedReceiver<AppEvent>,
) -> Result<()> {
    let tick_rate = app.tick_rate();

    while !app.should_quit() {
        // Apply any metrics/shutdown events delivered by the ZMQ task.
        drain_app_events(app, event_rx);
        terminal.draw(|frame| ui::render(frame, app))?;

        // poll() bounds how long we block on input, keeping redraws regular.
        if event::poll(tick_rate)? {
            if let Event::Key(key) = event::read()? {
                app.handle_key_event(key);
            }
        } else {
            // No input within the tick window: run periodic app logic.
            app.on_tick();
        }
    }

    Ok(())
}
|
||||||
|
|
||||||
/// Apply every event currently queued for the app without blocking.
fn drain_app_events(app: &mut App, receiver: &mut UnboundedReceiver<AppEvent>) {
    // try_recv never blocks; Empty and Disconnected both simply end the drain.
    while let Ok(event) = receiver.try_recv() {
        app.handle_app_event(event);
    }
}
|
|
||||||
|
|
||||||
fn init_tracing(verbosity: u8) -> Result<()> {
|
|
||||||
let level = match verbosity {
|
|
||||||
0 => "warn",
|
|
||||||
1 => "info",
|
|
||||||
2 => "debug",
|
|
||||||
_ => "trace",
|
|
||||||
};
|
|
||||||
|
|
||||||
let env_filter = std::env::var("RUST_LOG")
|
|
||||||
.ok()
|
|
||||||
.and_then(|value| EnvFilter::try_new(value).ok())
|
|
||||||
.unwrap_or_else(|| EnvFilter::new(level));
|
|
||||||
|
|
||||||
let writer = prepare_log_writer()?;
|
|
||||||
|
|
||||||
tracing_subscriber::fmt()
|
|
||||||
.with_env_filter(env_filter)
|
|
||||||
.with_target(false)
|
|
||||||
.with_ansi(false)
|
|
||||||
.with_writer(writer)
|
|
||||||
.compact()
|
|
||||||
.try_init()
|
|
||||||
.map_err(|err| anyhow!(err))?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn prepare_log_writer() -> Result<tracing_appender::non_blocking::NonBlocking> {
|
|
||||||
let logs_dir = Path::new("logs");
|
|
||||||
if !logs_dir.exists() {
|
|
||||||
fs::create_dir_all(logs_dir).with_context(|| {
|
|
||||||
format!("failed to create logs directory at {}", logs_dir.display())
|
|
||||||
})?;
|
|
||||||
}
|
|
||||||
|
|
||||||
let file_appender = tracing_appender::rolling::never(logs_dir, "cm-dashboard.log");
|
|
||||||
let (non_blocking, guard) = tracing_appender::non_blocking(file_appender);
|
|
||||||
LOG_GUARD.get_or_init(|| guard);
|
|
||||||
Ok(non_blocking)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn spawn_metrics_task(
|
|
||||||
context: ZmqContext,
|
|
||||||
sender: UnboundedSender<AppEvent>,
|
|
||||||
shutdown: Arc<AtomicBool>,
|
|
||||||
) -> JoinHandle<()> {
|
|
||||||
tokio::spawn(async move {
|
|
||||||
match spawn_blocking(move || metrics_blocking_loop(context, sender, shutdown)).await {
|
|
||||||
Ok(Ok(())) => {}
|
|
||||||
Ok(Err(error)) => warn!(%error, "ZMQ metrics worker exited with error"),
|
|
||||||
Err(join_error) => warn!(%join_error, "ZMQ metrics worker panicked"),
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Blocking ZMQ SUB loop: connect to the configured endpoints, subscribe,
/// and forward each received message to `handle_zmq_message` until
/// `shutdown` is set.
///
/// Runs on a blocking thread (see `spawn_metrics_task`); the 1s receive
/// timeout is what lets the loop notice the shutdown flag promptly.
fn metrics_blocking_loop(
    context: ZmqContext,
    sender: UnboundedSender<AppEvent>,
    shutdown: Arc<AtomicBool>,
) -> Result<()> {
    let zmq_context = NativeZmqContext::new();
    let socket = zmq_context
        .socket(zmq::SUB)
        .context("failed to create ZMQ SUB socket")?;

    // linger=0: drop pending messages immediately on close instead of blocking.
    socket
        .set_linger(0)
        .context("failed to configure ZMQ linger")?;
    // Bounded recv so the shutdown flag is re-checked at least once per second.
    socket
        .set_rcvtimeo(1_000)
        .context("failed to configure ZMQ receive timeout")?;

    // Best-effort fan-in: connect to every endpoint, tolerating individual failures.
    let mut connected_endpoints = 0;
    for endpoint in context.endpoints() {
        debug!(%endpoint, "attempting to connect to ZMQ endpoint");
        match socket.connect(endpoint) {
            Ok(()) => {
                debug!(%endpoint, "successfully connected to ZMQ endpoint");
                connected_endpoints += 1;
            }
            Err(error) => {
                warn!(%endpoint, %error, "failed to connect to ZMQ endpoint, continuing with others");
            }
        }
    }

    if connected_endpoints == 0 {
        return Err(anyhow!("failed to connect to any ZMQ endpoints"));
    }

    debug!("connected to {}/{} ZMQ endpoints", connected_endpoints, context.endpoints().len());

    // Without an explicit prefix, subscribe to everything (empty prefix).
    if let Some(prefix) = context.subscription() {
        socket
            .set_subscribe(prefix.as_bytes())
            .context("failed to set ZMQ subscription")?;
    } else {
        socket
            .set_subscribe(b"")
            .context("failed to subscribe to all ZMQ topics")?;
    }

    while !shutdown.load(Ordering::Relaxed) {
        match socket.recv_msg(0) {
            Ok(message) => {
                // A malformed message is logged and skipped, never fatal.
                if let Err(error) = handle_zmq_message(&message, &sender) {
                    warn!(%error, "failed to handle ZMQ message");
                }
            }
            Err(error) => {
                // EAGAIN is just the 1s receive timeout expiring — loop again
                // so the shutdown flag gets re-checked.
                if error == zmq::Error::EAGAIN {
                    continue;
                }
                warn!(%error, "ZMQ receive error");
                // Back off briefly so a persistent error can't spin the CPU.
                std::thread::sleep(Duration::from_millis(250));
            }
        }
    }

    debug!("ZMQ metrics worker shutting down");

    Ok(())
}
|
|
||||||
|
|
||||||
fn handle_zmq_message(
|
|
||||||
message: &NativeZmqMessage,
|
|
||||||
sender: &UnboundedSender<AppEvent>,
|
|
||||||
) -> Result<()> {
|
|
||||||
let bytes = message.to_vec();
|
|
||||||
|
|
||||||
let envelope: MetricsEnvelope =
|
|
||||||
serde_json::from_slice(&bytes).with_context(|| "failed to deserialize metrics envelope")?;
|
|
||||||
let timestamp = Utc
|
|
||||||
.timestamp_opt(envelope.timestamp as i64, 0)
|
|
||||||
.single()
|
|
||||||
.unwrap_or_else(|| Utc::now());
|
|
||||||
|
|
||||||
let host = envelope.hostname.clone();
|
|
||||||
|
|
||||||
let mut payload = envelope.metrics;
|
|
||||||
if let Some(obj) = payload.as_object_mut() {
|
|
||||||
obj.entry("timestamp")
|
|
||||||
.or_insert_with(|| Value::String(timestamp.to_rfc3339()));
|
|
||||||
}
|
|
||||||
|
|
||||||
match envelope.agent_type {
|
|
||||||
AgentType::Smart => match serde_json::from_value::<SmartMetrics>(payload.clone()) {
|
|
||||||
Ok(metrics) => {
|
|
||||||
let _ = sender.send(AppEvent::MetricsUpdated {
|
|
||||||
host,
|
|
||||||
smart: Some(metrics),
|
|
||||||
services: None,
|
|
||||||
system: None,
|
|
||||||
backup: None,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Err(error) => {
|
|
||||||
warn!(%error, "failed to parse smart metrics");
|
|
||||||
let _ = sender.send(AppEvent::MetricsFailed {
|
|
||||||
host,
|
|
||||||
error: format!("smart metrics parse error: {error:#}"),
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
},
|
|
||||||
AgentType::Service => match serde_json::from_value::<ServiceMetrics>(payload.clone()) {
|
|
||||||
Ok(metrics) => {
|
|
||||||
let _ = sender.send(AppEvent::MetricsUpdated {
|
|
||||||
host,
|
|
||||||
smart: None,
|
|
||||||
services: Some(metrics),
|
|
||||||
system: None,
|
|
||||||
backup: None,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Err(error) => {
|
|
||||||
warn!(%error, "failed to parse service metrics");
|
|
||||||
let _ = sender.send(AppEvent::MetricsFailed {
|
|
||||||
host,
|
|
||||||
error: format!("service metrics parse error: {error:#}"),
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
},
|
|
||||||
AgentType::System => match serde_json::from_value::<SystemMetrics>(payload.clone()) {
|
|
||||||
Ok(metrics) => {
|
|
||||||
let _ = sender.send(AppEvent::MetricsUpdated {
|
|
||||||
host,
|
|
||||||
smart: None,
|
|
||||||
services: None,
|
|
||||||
system: Some(metrics),
|
|
||||||
backup: None,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Err(error) => {
|
|
||||||
warn!(%error, "failed to parse system metrics");
|
|
||||||
let _ = sender.send(AppEvent::MetricsFailed {
|
|
||||||
host,
|
|
||||||
error: format!("system metrics parse error: {error:#}"),
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
},
|
|
||||||
AgentType::Backup => match serde_json::from_value::<BackupMetrics>(payload.clone()) {
|
|
||||||
Ok(metrics) => {
|
|
||||||
let _ = sender.send(AppEvent::MetricsUpdated {
|
|
||||||
host,
|
|
||||||
smart: None,
|
|
||||||
services: None,
|
|
||||||
system: None,
|
|
||||||
backup: Some(metrics),
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Err(error) => {
|
|
||||||
warn!(%error, "failed to parse backup metrics");
|
|
||||||
let _ = sender.send(AppEvent::MetricsFailed {
|
|
||||||
host,
|
|
||||||
error: format!("backup metrics parse error: {error:#}"),
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn ensure_default_config(cli: &Cli) -> Result<()> {
|
|
||||||
if let Some(path) = cli.config.as_ref() {
|
|
||||||
ensure_config_at(path, false)?;
|
|
||||||
} else {
|
|
||||||
let default_path = Path::new("config/dashboard.toml");
|
|
||||||
if !default_path.exists() {
|
|
||||||
generate_config_templates(Path::new("config"), false)?;
|
|
||||||
println!("Created default configuration in ./config");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn ensure_config_at(path: &Path, force: bool) -> Result<()> {
|
|
||||||
if path.exists() && !force {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(parent) = path.parent() {
|
|
||||||
if !parent.exists() {
|
|
||||||
fs::create_dir_all(parent)
|
|
||||||
.with_context(|| format!("failed to create directory {}", parent.display()))?;
|
|
||||||
}
|
|
||||||
|
|
||||||
write_template(path.to_path_buf(), DASHBOARD_TEMPLATE, force, "dashboard")?;
|
|
||||||
|
|
||||||
let hosts_path = parent.join("hosts.toml");
|
|
||||||
if !hosts_path.exists() || force {
|
|
||||||
write_template(hosts_path, HOSTS_TEMPLATE, force, "hosts")?;
|
|
||||||
}
|
|
||||||
println!(
|
|
||||||
"Created configuration templates in {} (dashboard: {})",
|
|
||||||
parent.display(),
|
|
||||||
path.display()
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
return Err(anyhow!("invalid configuration path {}", path.display()));
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn generate_config_templates(target_dir: &Path, force: bool) -> Result<()> {
|
|
||||||
if !target_dir.exists() {
|
|
||||||
fs::create_dir_all(target_dir)
|
|
||||||
.with_context(|| format!("failed to create directory {}", target_dir.display()))?;
|
|
||||||
}
|
|
||||||
|
|
||||||
write_template(
|
|
||||||
target_dir.join("dashboard.toml"),
|
|
||||||
DASHBOARD_TEMPLATE,
|
|
||||||
force,
|
|
||||||
"dashboard",
|
|
||||||
)?;
|
|
||||||
write_template(
|
|
||||||
target_dir.join("hosts.toml"),
|
|
||||||
HOSTS_TEMPLATE,
|
|
||||||
force,
|
|
||||||
"hosts",
|
|
||||||
)?;
|
|
||||||
|
|
||||||
println!(
|
|
||||||
"Configuration templates written to {}",
|
|
||||||
target_dir.display()
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn write_template(path: PathBuf, contents: &str, force: bool, name: &str) -> Result<()> {
|
|
||||||
if path.exists() && !force {
|
|
||||||
return Err(anyhow!(
|
|
||||||
"{} template already exists at {} (use --force to overwrite)",
|
|
||||||
name,
|
|
||||||
path.display()
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
fs::write(&path, contents)
|
|
||||||
.with_context(|| format!("failed to write {} template to {}", name, path.display()))?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
const DASHBOARD_TEMPLATE: &str = r#"# CM Dashboard configuration
|
|
||||||
|
|
||||||
[hosts]
|
|
||||||
# default_host = "srv01"
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "srv01"
|
|
||||||
enabled = true
|
|
||||||
# metadata = { rack = "R1" }
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "labbox"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[dashboard]
|
|
||||||
tick_rate_ms = 250
|
|
||||||
history_duration_minutes = 60
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "storage"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "services"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "backup"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[dashboard.widgets]]
|
|
||||||
id = "alerts"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[filesystem]
|
|
||||||
# cache_dir = "/var/lib/cm-dashboard/cache"
|
|
||||||
# history_dir = "/var/lib/cm-dashboard/history"
|
|
||||||
"#;
|
|
||||||
|
|
||||||
const HOSTS_TEMPLATE: &str = r#"# Optional separate hosts configuration
|
|
||||||
|
|
||||||
[hosts]
|
|
||||||
# default_host = "srv01"
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "srv01"
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
[[hosts.hosts]]
|
|
||||||
name = "labbox"
|
|
||||||
enabled = true
|
|
||||||
"#;
|
|
||||||
|
|||||||
11
dashboard/src/metrics/mod.rs
Normal file
11
dashboard/src/metrics/mod.rs
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
pub mod store;
|
||||||
|
|
||||||
|
pub use store::MetricStore;
|
||||||
|
|
||||||
|
/// Historical metric data point.
///
/// Records only the arrival time of a sample; the metric values themselves
/// live in `MetricStore::current_metrics`.
#[derive(Debug, Clone)]
pub struct MetricDataPoint {
    // Monotonic arrival time, used for retention-based cleanup.
    pub received_at: Instant,
}
|
||||||
175
dashboard/src/metrics/store.rs
Normal file
175
dashboard/src/metrics/store.rs
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
use cm_dashboard_shared::Metric;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
use tracing::{debug, info, warn};
|
||||||
|
|
||||||
|
use super::MetricDataPoint;
|
||||||
|
|
||||||
|
/// Central metric storage for the dashboard.
///
/// Holds the latest metric per (host, metric name), a per-host arrival
/// history for retention/trending, and per-host freshness timestamps used to
/// decide which hosts count as "connected".
pub struct MetricStore {
    /// Current metrics: hostname -> metric_name -> metric
    current_metrics: HashMap<String, HashMap<String, Metric>>,
    /// Historical metrics for trending (arrival times only; see MetricDataPoint)
    historical_metrics: HashMap<String, Vec<MetricDataPoint>>,
    /// Last update timestamp per host (monotonic clock)
    last_update: HashMap<String, Instant>,
    /// Configuration: cap on history entries kept per host
    max_metrics_per_host: usize,
    /// Configuration: how long history entries are retained
    history_retention: Duration,
}
|
||||||
|
|
||||||
|
impl MetricStore {
|
||||||
|
pub fn new(max_metrics_per_host: usize, history_retention_hours: u64) -> Self {
|
||||||
|
Self {
|
||||||
|
current_metrics: HashMap::new(),
|
||||||
|
historical_metrics: HashMap::new(),
|
||||||
|
last_update: HashMap::new(),
|
||||||
|
max_metrics_per_host,
|
||||||
|
history_retention: Duration::from_secs(history_retention_hours * 3600),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Update metrics for a specific host
|
||||||
|
pub fn update_metrics(&mut self, hostname: &str, metrics: Vec<Metric>) {
|
||||||
|
let now = Instant::now();
|
||||||
|
|
||||||
|
debug!("Updating {} metrics for host {}", metrics.len(), hostname);
|
||||||
|
|
||||||
|
// Get or create host entry
|
||||||
|
let host_metrics = self
|
||||||
|
.current_metrics
|
||||||
|
.entry(hostname.to_string())
|
||||||
|
.or_insert_with(HashMap::new);
|
||||||
|
|
||||||
|
// Get or create historical entry
|
||||||
|
let host_history = self
|
||||||
|
.historical_metrics
|
||||||
|
.entry(hostname.to_string())
|
||||||
|
.or_insert_with(Vec::new);
|
||||||
|
|
||||||
|
// Update current metrics and add to history
|
||||||
|
for metric in metrics {
|
||||||
|
let metric_name = metric.name.clone();
|
||||||
|
|
||||||
|
// Store current metric
|
||||||
|
host_metrics.insert(metric_name.clone(), metric.clone());
|
||||||
|
|
||||||
|
// Add to history
|
||||||
|
host_history.push(MetricDataPoint { received_at: now });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update last update timestamp
|
||||||
|
self.last_update.insert(hostname.to_string(), now);
|
||||||
|
|
||||||
|
// Get metrics count before cleanup
|
||||||
|
let metrics_count = host_metrics.len();
|
||||||
|
|
||||||
|
// Cleanup old history and enforce limits
|
||||||
|
self.cleanup_host_data(hostname);
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"Updated metrics for {}: {} current metrics",
|
||||||
|
hostname, metrics_count
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get current metric for a specific host
|
||||||
|
pub fn get_metric(&self, hostname: &str, metric_name: &str) -> Option<&Metric> {
|
||||||
|
self.current_metrics.get(hostname)?.get(metric_name)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Get all current metrics for a host as a vector
|
||||||
|
pub fn get_metrics_for_host(&self, hostname: &str) -> Vec<&Metric> {
|
||||||
|
if let Some(metrics_map) = self.current_metrics.get(hostname) {
|
||||||
|
metrics_map.values().collect()
|
||||||
|
} else {
|
||||||
|
Vec::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get connected hosts (hosts with recent updates)
|
||||||
|
pub fn get_connected_hosts(&self, timeout: Duration) -> Vec<String> {
|
||||||
|
let now = Instant::now();
|
||||||
|
|
||||||
|
self.last_update
|
||||||
|
.iter()
|
||||||
|
.filter_map(|(hostname, &last_update)| {
|
||||||
|
if now.duration_since(last_update) <= timeout {
|
||||||
|
Some(hostname.clone())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cleanup old data and enforce limits
|
||||||
|
fn cleanup_host_data(&mut self, hostname: &str) {
|
||||||
|
let now = Instant::now();
|
||||||
|
|
||||||
|
// Cleanup historical data
|
||||||
|
if let Some(history) = self.historical_metrics.get_mut(hostname) {
|
||||||
|
// Remove old entries
|
||||||
|
history.retain(|dp| now.duration_since(dp.received_at) <= self.history_retention);
|
||||||
|
|
||||||
|
// Enforce size limit
|
||||||
|
if history.len() > self.max_metrics_per_host {
|
||||||
|
let excess = history.len() - self.max_metrics_per_host;
|
||||||
|
history.drain(0..excess);
|
||||||
|
warn!(
|
||||||
|
"Trimmed {} old metrics for host {} (size limit: {})",
|
||||||
|
excess, hostname, self.max_metrics_per_host
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get agent versions from all hosts for cross-host comparison
|
||||||
|
pub fn get_agent_versions(&self) -> HashMap<String, String> {
|
||||||
|
let mut versions = HashMap::new();
|
||||||
|
|
||||||
|
for (hostname, metrics) in &self.current_metrics {
|
||||||
|
if let Some(version_metric) = metrics.get("agent_version") {
|
||||||
|
if let cm_dashboard_shared::MetricValue::String(version) = &version_metric.value {
|
||||||
|
versions.insert(hostname.clone(), version.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
versions
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check for agent version mismatches across hosts
|
||||||
|
pub fn get_version_mismatches(&self) -> Option<(String, Vec<String>)> {
|
||||||
|
let versions = self.get_agent_versions();
|
||||||
|
|
||||||
|
if versions.len() < 2 {
|
||||||
|
return None; // Need at least 2 hosts to compare
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the most common version (assume it's the "current" version)
|
||||||
|
let mut version_counts = HashMap::new();
|
||||||
|
for version in versions.values() {
|
||||||
|
*version_counts.entry(version.clone()).or_insert(0) += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
let most_common_version = version_counts
|
||||||
|
.iter()
|
||||||
|
.max_by_key(|(_, count)| *count)
|
||||||
|
.map(|(version, _)| version.clone())?;
|
||||||
|
|
||||||
|
// Find hosts with different versions
|
||||||
|
let outdated_hosts: Vec<String> = versions
|
||||||
|
.iter()
|
||||||
|
.filter(|(_, version)| *version != &most_common_version)
|
||||||
|
.map(|(hostname, _)| hostname.clone())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if outdated_hosts.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some((most_common_version, outdated_hosts))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,110 +0,0 @@
|
|||||||
use ratatui::layout::Rect;
|
|
||||||
use ratatui::Frame;
|
|
||||||
|
|
||||||
use crate::app::HostDisplayData;
|
|
||||||
use crate::data::metrics::BackupMetrics;
|
|
||||||
use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, connection_status_message, WidgetData, WidgetStatus, StatusLevel};
|
|
||||||
use crate::app::ConnectionStatus;
|
|
||||||
|
|
||||||
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
|
|
||||||
match host {
|
|
||||||
Some(data) => {
|
|
||||||
match (&data.connection_status, data.backup.as_ref()) {
|
|
||||||
(ConnectionStatus::Connected, Some(metrics)) => {
|
|
||||||
render_metrics(frame, data, metrics, area);
|
|
||||||
}
|
|
||||||
(ConnectionStatus::Connected, None) => {
|
|
||||||
render_placeholder(
|
|
||||||
frame,
|
|
||||||
area,
|
|
||||||
"Backups",
|
|
||||||
&format!("Host {} awaiting backup metrics", data.name),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
(status, _) => {
|
|
||||||
render_placeholder(
|
|
||||||
frame,
|
|
||||||
area,
|
|
||||||
"Backups",
|
|
||||||
&format!("Host {}: {}", data.name, connection_status_message(status, &data.last_error)),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => render_placeholder(frame, area, "Backups", "No hosts configured"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Render the populated backup widget: overall status header, a "Latest"
/// row for the most recent successful backup, and a "Disk" row for the
/// backup disk's health and usage.
fn render_metrics(frame: &mut Frame, _host: &HostDisplayData, metrics: &BackupMetrics, area: Rect) {
    // Widget-level status comes from the agent's overall backup status.
    let widget_status = status_level_from_agent_status(Some(&metrics.overall_status));

    let mut data = WidgetData::new(
        "Backups",
        Some(WidgetStatus::new(widget_status)),
        vec!["Backup".to_string(), "Status".to_string(), "Details".to_string()]
    );

    // Latest backup: Ok + relative age when a success timestamp exists,
    // otherwise Warning + "Never".
    let (latest_status, latest_time) = if let Some(last_success) = metrics.backup.last_success.as_ref() {
        let hours_ago = chrono::Utc::now().signed_duration_since(*last_success).num_hours();
        // Show hours under a day, whole days beyond that.
        let time_str = if hours_ago < 24 {
            format!("{}h ago", hours_ago)
        } else {
            format!("{}d ago", hours_ago / 24)
        };
        (StatusLevel::Ok, time_str)
    } else {
        (StatusLevel::Warning, "Never".to_string())
    };

    data.add_row(
        Some(WidgetStatus::new(latest_status)),
        vec![format!("Archives: {}, {:.1}GB total", metrics.backup.snapshot_count, metrics.backup.size_gb)],
        vec![
            "Latest".to_string(),
            latest_time,
            // Falls back to the total size when the latest archive size is
            // not reported.
            format!("{:.1}GB", metrics.backup.latest_archive_size_gb.unwrap_or(metrics.backup.size_gb)),
        ],
    );

    // Disk usage row; a missing disk report renders as "Unknown".
    if let Some(disk) = &metrics.disk {
        // Map the agent's health string onto a status level; anything
        // other than "ok"/"failed" is treated as a warning.
        let disk_status = match disk.health.as_str() {
            "ok" => StatusLevel::Ok,
            "failed" => StatusLevel::Error,
            _ => StatusLevel::Warning,
        };

        data.add_row(
            Some(WidgetStatus::new(disk_status)),
            vec![],
            vec![
                "Disk".to_string(),
                disk.health.clone(),
                {
                    // Use MB for sub-1GB usage (decimal convention:
                    // 1 GB = 1000 MB), GB otherwise.
                    let used_mb = disk.used_gb * 1000.0;
                    let used_str = if used_mb < 1000.0 {
                        format!("{:.0}MB", used_mb)
                    } else {
                        format!("{:.1}GB", disk.used_gb)
                    };
                    format!("{} ({}GB)", used_str, disk.total_gb.round() as u32)
                },
            ],
        );
    } else {
        data.add_row(
            Some(WidgetStatus::new(StatusLevel::Unknown)),
            vec![],
            vec![
                "Disk".to_string(),
                "Unknown".to_string(),
                "—".to_string(),
            ],
        );
    }

    render_widget_data(frame, area, data);
}
|
|
||||||
|
|
||||||
|
|
||||||
@@ -1,124 +0,0 @@
|
|||||||
use ratatui::layout::{Constraint, Direction, Layout, Rect};
|
|
||||||
use ratatui::style::{Color, Modifier, Style};
|
|
||||||
use ratatui::text::Span;
|
|
||||||
use ratatui::widgets::Block;
|
|
||||||
use ratatui::Frame;
|
|
||||||
|
|
||||||
use crate::app::App;
|
|
||||||
|
|
||||||
use super::{hosts, backup, services, storage, system};
|
|
||||||
|
|
||||||
/// Render the full dashboard frame.
///
/// Layout: a titled root block; the inner area split 50/50 horizontally,
/// with the left column split into three stacked widgets (system, storage,
/// backup) over the hosts list, and the right column given to services.
/// A help overlay is drawn last when toggled on.
pub fn render(frame: &mut Frame, app: &App) {
    let host_summaries = app.host_display_data();
    let primary_host = app.active_host_display();

    // Title shows the active host's name when one is selected.
    let title = if let Some(host) = primary_host.as_ref() {
        format!("CM Dashboard • {}", host.name)
    } else {
        "CM Dashboard".to_string()
    };

    let root_block = Block::default().title(Span::styled(
        title,
        Style::default()
            .fg(Color::Cyan)
            .add_modifier(Modifier::BOLD),
    ));

    let size = frame.size();
    frame.render_widget(root_block, size);

    // Shrink by one cell on each side so widgets don't overlap the root
    // block's title row.
    let outer = inner_rect(size);

    let main_columns = Layout::default()
        .direction(Direction::Horizontal)
        .constraints([Constraint::Percentage(50), Constraint::Percentage(50)])
        .split(outer);

    // Left column: 75% widgets, 25% hosts summary.
    let left_side = Layout::default()
        .direction(Direction::Vertical)
        .constraints([Constraint::Percentage(75), Constraint::Percentage(25)])
        .split(main_columns[0]);

    // Three equal-height widget slots in the left column.
    let left_widgets = Layout::default()
        .direction(Direction::Vertical)
        .constraints([
            Constraint::Ratio(1, 3),
            Constraint::Ratio(1, 3),
            Constraint::Ratio(1, 3),
        ])
        .split(left_side[0]);

    let services_area = main_columns[1];

    system::render(frame, primary_host.as_ref(), left_widgets[0]);
    storage::render(frame, primary_host.as_ref(), left_widgets[1]);
    backup::render(frame, primary_host.as_ref(), left_widgets[2]);
    services::render(frame, primary_host.as_ref(), services_area);

    hosts::render(frame, &host_summaries, left_side[1]);

    // Help overlay renders over the full frame, after everything else.
    if app.help_visible() {
        render_help(frame, size);
    }
}
|
|
||||||
|
|
||||||
fn inner_rect(area: Rect) -> Rect {
|
|
||||||
Rect {
|
|
||||||
x: area.x + 1,
|
|
||||||
y: area.y + 1,
|
|
||||||
width: area.width.saturating_sub(2),
|
|
||||||
height: area.height.saturating_sub(2),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Render the keyboard-shortcut help popup centered over `area`.
///
/// The popup area is cleared first so underlying widgets don't bleed
/// through, then a bordered paragraph with the shortcut list is drawn.
fn render_help(frame: &mut Frame, area: Rect) {
    use ratatui::text::Line;
    use ratatui::widgets::{Block, Borders, Clear, Paragraph, Wrap};

    // 60% width x 40% height, centered.
    let help_area = centered_rect(60, 40, area);
    let lines = vec![
        Line::from("Keyboard Shortcuts"),
        Line::from("←/→ or h/l: Switch active host"),
        Line::from("r: Manual refresh status"),
        Line::from("?: Toggle this help"),
        Line::from("q / Esc: Quit dashboard"),
    ];

    let block = Block::default()
        .title(Span::styled(
            "Help",
            Style::default()
                .fg(Color::White)
                .add_modifier(Modifier::BOLD),
        ))
        .borders(Borders::ALL)
        .style(Style::default().bg(Color::Black));

    let paragraph = Paragraph::new(lines).wrap(Wrap { trim: true }).block(block);

    // Clear before drawing so the popup fully covers what's beneath it.
    frame.render_widget(Clear, help_area);
    frame.render_widget(paragraph, help_area);
}
|
|
||||||
|
|
||||||
fn centered_rect(percent_x: u16, percent_y: u16, area: Rect) -> Rect {
|
|
||||||
let vertical = Layout::default()
|
|
||||||
.direction(Direction::Vertical)
|
|
||||||
.constraints([
|
|
||||||
Constraint::Percentage((100 - percent_y) / 2),
|
|
||||||
Constraint::Percentage(percent_y),
|
|
||||||
Constraint::Percentage((100 - percent_y) / 2),
|
|
||||||
])
|
|
||||||
.split(area);
|
|
||||||
|
|
||||||
let horizontal = Layout::default()
|
|
||||||
.direction(Direction::Horizontal)
|
|
||||||
.constraints([
|
|
||||||
Constraint::Percentage((100 - percent_x) / 2),
|
|
||||||
Constraint::Percentage(percent_x),
|
|
||||||
Constraint::Percentage((100 - percent_x) / 2),
|
|
||||||
])
|
|
||||||
.split(vertical[1]);
|
|
||||||
|
|
||||||
horizontal[1]
|
|
||||||
}
|
|
||||||
@@ -1,296 +0,0 @@
|
|||||||
use chrono::{DateTime, Utc};
|
|
||||||
use ratatui::layout::Rect;
|
|
||||||
use ratatui::Frame;
|
|
||||||
|
|
||||||
use crate::app::{HostDisplayData, ConnectionStatus};
|
|
||||||
// Removed: evaluate_performance and PerfSeverity no longer needed
|
|
||||||
use crate::ui::widget::{render_widget_data, WidgetData, WidgetStatus, StatusLevel};
|
|
||||||
|
|
||||||
/// Render the "Hosts" summary widget: one row per host with its name,
/// status text, and latest data timestamp. The widget's own status is the
/// worst severity across all hosts.
pub fn render(frame: &mut Frame, hosts: &[HostDisplayData], area: Rect) {
    // Counts are unused here; only the aggregate severity drives the
    // widget status.
    let (severity, _ok_count, _warn_count, _fail_count) = classify_hosts(hosts);

    let title = "Hosts".to_string();

    let widget_status = match severity {
        HostSeverity::Critical => StatusLevel::Error,
        HostSeverity::Warning => StatusLevel::Warning,
        HostSeverity::Healthy => StatusLevel::Ok,
        HostSeverity::Unknown => StatusLevel::Unknown,
    };

    let mut data = WidgetData::new(
        title,
        Some(WidgetStatus::new(widget_status)),
        vec!["Host".to_string(), "Status".to_string(), "Timestamp".to_string()]
    );

    if hosts.is_empty() {
        data.add_row(
            None,
            vec![],
            vec![
                "No hosts configured".to_string(),
                "".to_string(),
                "".to_string(),
            ],
        );
    } else {
        for host in hosts {
            let (status_text, severity, _emphasize) = host_status(host);
            let status_level = match severity {
                HostSeverity::Critical => StatusLevel::Error,
                HostSeverity::Warning => StatusLevel::Warning,
                HostSeverity::Healthy => StatusLevel::Ok,
                HostSeverity::Unknown => StatusLevel::Unknown,
            };
            // Most recent timestamp across the host's metric categories;
            // an em dash marks "no data yet".
            let update = latest_timestamp(host)
                .map(|ts| ts.format("%Y-%m-%d %H:%M:%S").to_string())
                .unwrap_or_else(|| "—".to_string());

            data.add_row(
                Some(WidgetStatus::new(status_level)),
                vec![],
                vec![
                    host.name.clone(),
                    status_text,
                    update,
                ],
            );
        }
    }

    render_widget_data(frame, area, data);
}
|
|
||||||
|
|
||||||
/// Aggregated health classification for a single host.
///
/// `Unknown` means there is not enough data to judge the host (no
/// connection state or no metric categories reported yet).
#[derive(Copy, Clone, Eq, PartialEq)]
enum HostSeverity {
    Healthy,
    Warning,
    Critical,
    Unknown,
}
|
|
||||||
|
|
||||||
fn classify_hosts(hosts: &[HostDisplayData]) -> (HostSeverity, usize, usize, usize) {
|
|
||||||
let mut ok = 0;
|
|
||||||
let mut warn = 0;
|
|
||||||
let mut fail = 0;
|
|
||||||
|
|
||||||
for host in hosts {
|
|
||||||
let severity = host_severity(host);
|
|
||||||
match severity {
|
|
||||||
HostSeverity::Healthy => ok += 1,
|
|
||||||
HostSeverity::Warning => warn += 1,
|
|
||||||
HostSeverity::Critical => fail += 1,
|
|
||||||
HostSeverity::Unknown => warn += 1,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let highest = if fail > 0 {
|
|
||||||
HostSeverity::Critical
|
|
||||||
} else if warn > 0 {
|
|
||||||
HostSeverity::Warning
|
|
||||||
} else if ok > 0 {
|
|
||||||
HostSeverity::Healthy
|
|
||||||
} else {
|
|
||||||
HostSeverity::Unknown
|
|
||||||
};
|
|
||||||
|
|
||||||
(highest, ok, warn, fail)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Classify one host's overall severity.
///
/// Checks run in priority order and the first decisive one wins:
/// connection state, recorded error, SMART, services, backup. A connected
/// host with no metric categories at all is `Unknown`; otherwise it is
/// `Healthy`.
fn host_severity(host: &HostDisplayData) -> HostSeverity {
    // Check connection status first — a bad connection overrides any
    // (possibly stale) metric data.
    match host.connection_status {
        ConnectionStatus::Error => return HostSeverity::Critical,
        ConnectionStatus::Timeout => return HostSeverity::Warning,
        ConnectionStatus::Unknown => return HostSeverity::Unknown,
        ConnectionStatus::Connected => {}, // Continue with other checks
    }

    if host.last_error.is_some() {
        return HostSeverity::Critical;
    }

    if let Some(smart) = host.smart.as_ref() {
        if smart.summary.critical > 0 {
            return HostSeverity::Critical;
        }
        // Any SMART warning count or reported issue downgrades to Warning.
        if smart.summary.warning > 0 || !smart.issues.is_empty() {
            return HostSeverity::Warning;
        }
    }

    if let Some(services) = host.services.as_ref() {
        if services.summary.failed > 0 {
            return HostSeverity::Critical;
        }
        if services.summary.degraded > 0 {
            return HostSeverity::Warning;
        }

        // TODO: Update to use agent-provided system statuses instead of evaluate_performance
    }

    if let Some(backup) = host.backup.as_ref() {
        // Agent reports backup status as a string; only these two values
        // affect severity.
        match backup.overall_status.as_str() {
            "critical" => return HostSeverity::Critical,
            "warning" => return HostSeverity::Warning,
            _ => {}
        }
    }

    // Connected but no metric categories at all: not enough data to call
    // the host healthy.
    if host.smart.is_none() && host.services.is_none() && host.backup.is_none() {
        HostSeverity::Unknown
    } else {
        HostSeverity::Healthy
    }
}
|
|
||||||
|
|
||||||
/// Build the human-readable status line for one host.
///
/// Returns `(status_text, severity, emphasize)`. Checks mirror
/// `host_severity`'s priority order (connection, error, SMART, services,
/// backup) so text and severity stay consistent; the first decisive check
/// wins. `emphasize` is true for every non-ok, non-healthy outcome.
fn host_status(host: &HostDisplayData) -> (String, HostSeverity, bool) {
    // Check connection status first.
    match host.connection_status {
        ConnectionStatus::Error => {
            // Include the recorded error detail when one exists.
            let msg = if let Some(error) = &host.last_error {
                format!("Connection error: {}", error)
            } else {
                "Connection error".to_string()
            };
            return (msg, HostSeverity::Critical, true);
        },
        ConnectionStatus::Timeout => {
            let msg = if let Some(error) = &host.last_error {
                format!("Keep-alive timeout: {}", error)
            } else {
                "Keep-alive timeout".to_string()
            };
            return (msg, HostSeverity::Warning, true);
        },
        ConnectionStatus::Unknown => {
            return ("No data received".to_string(), HostSeverity::Unknown, true);
        },
        ConnectionStatus::Connected => {}, // Continue with other checks
    }

    if let Some(error) = &host.last_error {
        return (format!("error: {}", error), HostSeverity::Critical, true);
    }

    if let Some(smart) = host.smart.as_ref() {
        if smart.summary.critical > 0 {
            return (
                "critical: SMART critical".to_string(),
                HostSeverity::Critical,
                true,
            );
        }
        // Surface only the first SMART issue in the status text.
        if let Some(issue) = smart.issues.first() {
            return (format!("warning: {}", issue), HostSeverity::Warning, true);
        }
    }

    if let Some(services) = host.services.as_ref() {
        if services.summary.failed > 0 {
            return (
                format!("critical: {} failed svc", services.summary.failed),
                HostSeverity::Critical,
                true,
            );
        }
        if services.summary.degraded > 0 {
            return (
                format!("warning: {} degraded svc", services.summary.degraded),
                HostSeverity::Warning,
                true,
            );
        }

        // TODO: Update to use agent-provided system statuses instead of evaluate_performance
    }

    if let Some(backup) = host.backup.as_ref() {
        match backup.overall_status.as_str() {
            "critical" => {
                return (
                    "critical: backup failed".to_string(),
                    HostSeverity::Critical,
                    true,
                );
            }
            "warning" => {
                return (
                    "warning: backup warning".to_string(),
                    HostSeverity::Warning,
                    true,
                );
            }
            _ => {}
        }
    }

    // Connected but no metric categories yet: distinguish "never produced
    // data" from "had data before, nothing recent".
    if host.smart.is_none() && host.services.is_none() && host.backup.is_none() {
        let status = if host.last_success.is_none() {
            "pending: awaiting metrics"
        } else {
            "pending: no recent data"
        };

        return (status.to_string(), HostSeverity::Warning, false);
    }

    ("ok".to_string(), HostSeverity::Healthy, false)
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Most recent timestamp observed for this host: the maximum across the
/// last successful update and each metric category's own timestamp
/// (SMART, services, backup). `None` when no timestamp exists at all.
fn latest_timestamp(host: &HostDisplayData) -> Option<DateTime<Utc>> {
    let candidates = [
        host.last_success,
        host.smart.as_ref().map(|s| s.timestamp),
        host.services.as_ref().map(|s| s.timestamp),
        host.backup.as_ref().map(|b| b.timestamp),
    ];

    candidates.into_iter().flatten().max()
}
|
|
||||||
|
|
||||||
@@ -1,9 +1,789 @@
|
|||||||
pub mod hosts;
|
use anyhow::Result;
|
||||||
pub mod backup;
|
use crossterm::event::{Event, KeyCode};
|
||||||
pub mod dashboard;
|
use ratatui::{
|
||||||
pub mod services;
|
layout::{Constraint, Direction, Layout, Rect},
|
||||||
pub mod storage;
|
style::Style,
|
||||||
pub mod system;
|
widgets::{Block, Paragraph},
|
||||||
pub mod widget;
|
Frame,
|
||||||
|
};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::time::Instant;
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
pub use dashboard::render;
|
pub mod theme;
|
||||||
|
pub mod widgets;
|
||||||
|
|
||||||
|
use crate::config::DashboardConfig;
|
||||||
|
use crate::metrics::MetricStore;
|
||||||
|
use cm_dashboard_shared::{Metric, Status};
|
||||||
|
use theme::{Components, Layout as ThemeLayout, Theme, Typography};
|
||||||
|
use widgets::{BackupWidget, ServicesWidget, SystemWidget, Widget};
|
||||||
|
|
||||||
|
/// Commands that can be triggered from the UI and dispatched for
/// execution against a specific host.
#[derive(Debug, Clone)]
pub enum UiCommand {
    /// Start the named service on `hostname`.
    ServiceStart { hostname: String, service_name: String },
    /// Stop the named service on `hostname`.
    ServiceStop { hostname: String, service_name: String },
    /// Kick off a backup run on `hostname`.
    TriggerBackup { hostname: String },
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Types of commands for status tracking — records which kind of
/// in-flight command a pending service transition belongs to (see
/// `HostWidgets::pending_service_transitions`).
#[derive(Debug, Clone)]
pub enum CommandType {
    ServiceStart,
    ServiceStop,
    BackupTrigger,
}
|
||||||
|
|
||||||
|
/// Panel types for focus management
|
||||||
|
|
||||||
|
/// Widget states for a specific host.
///
/// Each connected host keeps its own widget instances and scroll state so
/// switching hosts preserves what the user was looking at.
#[derive(Clone)]
pub struct HostWidgets {
    /// System widget state (includes CPU, Memory, NixOS info, Storage)
    pub system_widget: SystemWidget,
    /// Services widget state
    pub services_widget: ServicesWidget,
    /// Backup widget state
    pub backup_widget: BackupWidget,
    /// Scroll offset for the system panel.
    pub system_scroll_offset: usize,
    /// Scroll offset for the services panel.
    pub services_scroll_offset: usize,
    /// Scroll offset for the backup panel.
    pub backup_scroll_offset: usize,
    /// Last update time for this host
    pub last_update: Option<Instant>,
    /// Pending service transitions for immediate visual feedback:
    /// service_name -> (command_type, original_status, start_time).
    pub pending_service_transitions: HashMap<String, (CommandType, String, Instant)>,
}
|
||||||
|
|
||||||
|
impl HostWidgets {
    /// Create a fresh widget set: empty widgets, zeroed scroll offsets,
    /// no recorded update time, and no pending transitions.
    pub fn new() -> Self {
        Self {
            system_widget: SystemWidget::new(),
            services_widget: ServicesWidget::new(),
            backup_widget: BackupWidget::new(),
            system_scroll_offset: 0,
            services_scroll_offset: 0,
            backup_scroll_offset: 0,
            last_update: None,
            pending_service_transitions: HashMap::new(),
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Main TUI application state: per-host widget state, host navigation,
/// and quit/navigation flags.
pub struct TuiApp {
    /// Widget states per host (hostname -> HostWidgets)
    host_widgets: HashMap<String, HostWidgets>,
    /// Current active host
    current_host: Option<String>,
    /// Available hosts
    available_hosts: Vec<String>,
    /// Host index for navigation (index of `current_host` in `available_hosts`)
    host_index: usize,
    /// Should quit application
    should_quit: bool,
    /// Track if user manually navigated away from localhost
    /// (suppresses automatic switch-back to localhost in `update_hosts`)
    user_navigated_away: bool,
    /// Dashboard configuration
    config: DashboardConfig,
}
|
||||||
|
|
||||||
|
impl TuiApp {
|
||||||
|
/// Create a new TUI app with no hosts and default navigation state.
pub fn new(config: DashboardConfig) -> Self {
    Self {
        host_widgets: HashMap::new(),
        current_host: None,
        available_hosts: Vec::new(),
        host_index: 0,
        should_quit: false,
        user_navigated_away: false,
        config,
    }
}
|
||||||
|
|
||||||
|
/// Get or create host widgets for the given hostname.
///
/// Lazily initializes a `HostWidgets` entry the first time a host is
/// seen and returns a mutable reference either way.
fn get_or_create_host_widgets(&mut self, hostname: &str) -> &mut HostWidgets {
    self.host_widgets
        .entry(hostname.to_string())
        .or_insert_with(HostWidgets::new)
}
|
||||||
|
|
||||||
|
/// Update widgets with metrics from store (only for the current host).
///
/// Partitions the host's metrics by name prefix into per-widget groups,
/// clears completed service transitions, then feeds each widget its slice
/// of metrics and stamps the host's last-update time. Hosts without any
/// metrics are left untouched.
pub fn update_metrics(&mut self, metric_store: &MetricStore) {
    // Check for rebuild completion by agent hash change
    if let Some(hostname) = self.current_host.clone() {
        // Only update widgets if we have metrics for this host.
        let all_metrics = metric_store.get_metrics_for_host(&hostname);
        if !all_metrics.is_empty() {
            // Partition metrics before taking a mutable borrow of the
            // host's widgets (self is needed immutably here).
            let cpu_metrics: Vec<&Metric> = all_metrics
                .iter()
                .filter(|m| {
                    m.name.starts_with("cpu_")
                        || m.name.contains("c_state_")
                        || m.name.starts_with("process_top_")
                })
                .copied()
                .collect();
            let memory_metrics: Vec<&Metric> = all_metrics
                .iter()
                .filter(|m| m.name.starts_with("memory_") || m.name.starts_with("disk_tmp_"))
                .copied()
                .collect();
            let service_metrics: Vec<&Metric> = all_metrics
                .iter()
                .filter(|m| m.name.starts_with("service_"))
                .copied()
                .collect();
            let all_backup_metrics: Vec<&Metric> = all_metrics
                .iter()
                .filter(|m| m.name.starts_with("backup_"))
                .copied()
                .collect();

            // Clear completed transitions first, before widgets render
            // the fresh service states.
            self.clear_completed_transitions(&hostname, &service_metrics);

            // Now take the mutable borrow and update the widgets.
            let host_widgets = self.get_or_create_host_widgets(&hostname);

            // Collect all system metrics (CPU, memory, NixOS, disk/storage).
            let mut system_metrics = cpu_metrics;
            system_metrics.extend(memory_metrics);

            // NixOS metrics use exact name matching (prefix matching
            // previously broke the build display).
            let nixos_metrics: Vec<&Metric> = all_metrics
                .iter()
                .filter(|m| m.name == "system_nixos_build" || m.name == "system_active_users" || m.name == "agent_version")
                .copied()
                .collect();
            system_metrics.extend(nixos_metrics);

            // Add disk/storage metrics.
            let disk_metrics: Vec<&Metric> = all_metrics
                .iter()
                .filter(|m| m.name.starts_with("disk_"))
                .copied()
                .collect();
            system_metrics.extend(disk_metrics);

            host_widgets.system_widget.update_from_metrics(&system_metrics);
            host_widgets
                .services_widget
                .update_from_metrics(&service_metrics);
            host_widgets
                .backup_widget
                .update_from_metrics(&all_backup_metrics);

            host_widgets.last_update = Some(Instant::now());
        }
    }
}
|
||||||
|
|
||||||
|
/// Update available hosts with localhost prioritization
|
||||||
|
pub fn update_hosts(&mut self, hosts: Vec<String>) {
|
||||||
|
// Sort hosts alphabetically
|
||||||
|
let mut sorted_hosts = hosts.clone();
|
||||||
|
|
||||||
|
// Keep hosts that have pending transitions even if they're offline
|
||||||
|
for (hostname, host_widgets) in &self.host_widgets {
|
||||||
|
if !host_widgets.pending_service_transitions.is_empty() {
|
||||||
|
if !sorted_hosts.contains(hostname) {
|
||||||
|
sorted_hosts.push(hostname.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sorted_hosts.sort();
|
||||||
|
self.available_hosts = sorted_hosts;
|
||||||
|
|
||||||
|
// Get the current hostname (localhost) for auto-selection
|
||||||
|
let localhost = gethostname::gethostname().to_string_lossy().to_string();
|
||||||
|
if !self.available_hosts.is_empty() {
|
||||||
|
if self.available_hosts.contains(&localhost) && !self.user_navigated_away {
|
||||||
|
// Localhost is available and user hasn't navigated away - switch to it
|
||||||
|
self.current_host = Some(localhost.clone());
|
||||||
|
// Find the actual index of localhost in the sorted list
|
||||||
|
self.host_index = self.available_hosts.iter().position(|h| h == &localhost).unwrap_or(0);
|
||||||
|
} else if self.current_host.is_none() {
|
||||||
|
// No current host - select first available (which is localhost if available)
|
||||||
|
self.current_host = Some(self.available_hosts[0].clone());
|
||||||
|
self.host_index = 0;
|
||||||
|
} else if let Some(ref current) = self.current_host {
|
||||||
|
if !self.available_hosts.contains(current) {
|
||||||
|
// Current host disconnected - select first available and reset navigation flag
|
||||||
|
self.current_host = Some(self.available_hosts[0].clone());
|
||||||
|
self.host_index = 0;
|
||||||
|
self.user_navigated_away = false; // Reset since we're forced to switch
|
||||||
|
} else if let Some(index) = self.available_hosts.iter().position(|h| h == current) {
|
||||||
|
// Update index for current host
|
||||||
|
self.host_index = index;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Handle keyboard input
///
/// Translates a terminal `Event` into dashboard actions. Navigation and
/// selection keys mutate local UI state; service start/stop and backup keys
/// return a `UiCommand` for the caller to dispatch; the rebuild/log keys
/// spawn a detached `tmux display-popup` running ssh and return nothing.
pub fn handle_input(&mut self, event: Event) -> Result<Option<UiCommand>> {
    if let Event::Key(key) = event {
        match key.code {
            KeyCode::Char('q') => {
                // Request application shutdown; polled via should_quit().
                self.should_quit = true;
            }
            KeyCode::Left => {
                self.navigate_host(-1);
            }
            KeyCode::Right => {
                self.navigate_host(1);
            }
            KeyCode::Char('r') => {
                // System rebuild command - works on any panel for current host
                if let Some(hostname) = self.current_host.clone() {
                    // Create command that shows CM Dashboard logo and then rebuilds.
                    // NOTE(review): rebuild_alias is interpolated unquoted into the
                    // remote shell line — assumed to come from trusted config only.
                    let logo_and_rebuild = format!(
                        r"cat << 'EOF'
NixOS System Rebuild
Target: {}

EOF
ssh -tt {}@{} 'bash -ic {}'",
                        hostname,
                        self.config.ssh.rebuild_user,
                        hostname,
                        self.config.ssh.rebuild_alias
                    );

                    std::process::Command::new("tmux")
                        .arg("display-popup")
                        .arg("-w")
                        .arg("80%")
                        .arg("-h")
                        .arg("80%")
                        .arg(&logo_and_rebuild)
                        .spawn()
                        .ok(); // Ignore errors, tmux will handle them
                }
            }
            KeyCode::Char('s') => {
                // Service start command
                if let (Some(service_name), Some(hostname)) = (self.get_selected_service(), self.current_host.clone()) {
                    // Only emit the command when state validation allows it.
                    if self.start_command(&hostname, CommandType::ServiceStart, service_name.clone()) {
                        return Ok(Some(UiCommand::ServiceStart { hostname, service_name }));
                    }
                }
            }
            KeyCode::Char('S') => {
                // Service stop command
                if let (Some(service_name), Some(hostname)) = (self.get_selected_service(), self.current_host.clone()) {
                    if self.start_command(&hostname, CommandType::ServiceStop, service_name.clone()) {
                        return Ok(Some(UiCommand::ServiceStop { hostname, service_name }));
                    }
                }
            }
            KeyCode::Char('J') => {
                // Show service logs via journalctl in tmux popup
                if let (Some(service_name), Some(hostname)) = (self.get_selected_service(), self.current_host.clone()) {
                    let journalctl_command = format!(
                        "ssh -tt {}@{} 'journalctl -u {}.service -f --no-pager -n 50'",
                        self.config.ssh.rebuild_user,
                        hostname,
                        service_name
                    );

                    std::process::Command::new("tmux")
                        .arg("display-popup")
                        .arg("-w")
                        .arg("80%")
                        .arg("-h")
                        .arg("80%")
                        .arg("-T")
                        .arg(format!("Logs: {}", service_name))
                        .arg(&journalctl_command)
                        .spawn()
                        .ok(); // Ignore errors, tmux will handle them
                }
            }
            KeyCode::Char('L') => {
                // Show custom service log file in tmux popup
                if let (Some(service_name), Some(hostname)) = (self.get_selected_service(), self.current_host.clone()) {
                    // Check if this service has a custom log file configured
                    if let Some(host_logs) = self.config.service_logs.get(&hostname) {
                        if let Some(log_config) = host_logs.iter().find(|config| config.service_name == service_name) {
                            let tail_command = format!(
                                "ssh -tt {}@{} 'tail -f {}'",
                                self.config.ssh.rebuild_user,
                                hostname,
                                log_config.log_file_path
                            );

                            std::process::Command::new("tmux")
                                .arg("display-popup")
                                .arg("-w")
                                .arg("80%")
                                .arg("-h")
                                .arg("80%")
                                .arg("-T")
                                .arg(format!("Custom Log: {}", service_name))
                                .arg(&tail_command)
                                .spawn()
                                .ok(); // Ignore errors, tmux will handle them
                        }
                    }
                }
            }
            KeyCode::Char('b') => {
                // Trigger backup
                if let Some(hostname) = self.current_host.clone() {
                    // NOTE(review): unlike 's'/'S', the validation result is
                    // ignored — the backup command is always emitted. Confirm
                    // this is intentional.
                    self.start_command(&hostname, CommandType::BackupTrigger, hostname.clone());
                    return Ok(Some(UiCommand::TriggerBackup { hostname }));
                }
            }
            KeyCode::Tab => {
                // Tab cycles to next host
                self.navigate_host(1);
            }
            KeyCode::Up | KeyCode::Char('k') => {
                // Move service selection up
                if let Some(hostname) = self.current_host.clone() {
                    let host_widgets = self.get_or_create_host_widgets(&hostname);
                    host_widgets.services_widget.select_previous();
                }
            }
            KeyCode::Down | KeyCode::Char('j') => {
                // Move service selection down
                if let Some(hostname) = self.current_host.clone() {
                    // Read the count in an inner scope so the first mutable
                    // borrow ends before borrowing again for select_next.
                    let total_services = {
                        let host_widgets = self.get_or_create_host_widgets(&hostname);
                        host_widgets.services_widget.get_total_services_count()
                    };
                    let host_widgets = self.get_or_create_host_widgets(&hostname);
                    host_widgets.services_widget.select_next(total_services);
                }
            }
            _ => {}
        }
    }
    Ok(None)
}
|
||||||
|
|
||||||
|
/// Navigate between hosts
|
||||||
|
fn navigate_host(&mut self, direction: i32) {
|
||||||
|
if self.available_hosts.is_empty() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let len = self.available_hosts.len();
|
||||||
|
if direction > 0 {
|
||||||
|
self.host_index = (self.host_index + 1) % len;
|
||||||
|
} else {
|
||||||
|
self.host_index = if self.host_index == 0 {
|
||||||
|
len - 1
|
||||||
|
} else {
|
||||||
|
self.host_index - 1
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
self.current_host = Some(self.available_hosts[self.host_index].clone());
|
||||||
|
|
||||||
|
// Check if user navigated away from localhost
|
||||||
|
let localhost = gethostname::gethostname().to_string_lossy().to_string();
|
||||||
|
if let Some(ref current) = self.current_host {
|
||||||
|
if current != &localhost {
|
||||||
|
self.user_navigated_away = true;
|
||||||
|
} else {
|
||||||
|
self.user_navigated_away = false; // User navigated back to localhost
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Switched to host: {}", self.current_host.as_ref().unwrap());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Get the currently selected service name from the services widget
|
||||||
|
fn get_selected_service(&self) -> Option<String> {
|
||||||
|
if let Some(hostname) = &self.current_host {
|
||||||
|
if let Some(host_widgets) = self.host_widgets.get(hostname) {
|
||||||
|
return host_widgets.services_widget.get_selected_service();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Should quit application
///
/// Returns the flag set by the 'q' key in `handle_input`; the main loop
/// polls this to decide when to exit.
pub fn should_quit(&self) -> bool {
    self.should_quit
}
|
||||||
|
|
||||||
|
/// Get current service status for state-aware command validation
|
||||||
|
fn get_current_service_status(&self, hostname: &str, service_name: &str) -> Option<String> {
|
||||||
|
if let Some(host_widgets) = self.host_widgets.get(hostname) {
|
||||||
|
return host_widgets.services_widget.get_service_status(service_name);
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Start command execution with immediate visual feedback
|
||||||
|
pub fn start_command(&mut self, hostname: &str, command_type: CommandType, target: String) -> bool {
|
||||||
|
// Get current service status to validate command
|
||||||
|
let current_status = self.get_current_service_status(hostname, &target);
|
||||||
|
|
||||||
|
// Validate if command makes sense for current state
|
||||||
|
let should_execute = match (&command_type, current_status.as_deref()) {
|
||||||
|
(CommandType::ServiceStart, Some("inactive") | Some("failed") | Some("dead")) => true,
|
||||||
|
(CommandType::ServiceStop, Some("active")) => true,
|
||||||
|
(CommandType::ServiceStart, Some("active")) => {
|
||||||
|
// Already running - don't execute
|
||||||
|
false
|
||||||
|
},
|
||||||
|
(CommandType::ServiceStop, Some("inactive") | Some("failed") | Some("dead")) => {
|
||||||
|
// Already stopped - don't execute
|
||||||
|
false
|
||||||
|
},
|
||||||
|
(_, None) => {
|
||||||
|
// Unknown service state - allow command to proceed
|
||||||
|
true
|
||||||
|
},
|
||||||
|
_ => true, // Default: allow other combinations
|
||||||
|
};
|
||||||
|
|
||||||
|
// ALWAYS store the pending transition for immediate visual feedback, even if we don't execute
|
||||||
|
if let Some(host_widgets) = self.host_widgets.get_mut(hostname) {
|
||||||
|
host_widgets.pending_service_transitions.insert(
|
||||||
|
target.clone(),
|
||||||
|
(command_type, current_status.unwrap_or_else(|| "unknown".to_string()), Instant::now())
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
should_execute
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clear pending transitions when real status updates arrive or timeout
///
/// Scans each pending transition against the freshest service status
/// metrics; a transition is dropped once the status has moved in the
/// direction the command intended.
/// NOTE(review): `_start_time` is unused here, so no timeout is applied in
/// this function despite the doc line — confirm timeouts are handled
/// elsewhere.
fn clear_completed_transitions(&mut self, hostname: &str, service_metrics: &[&Metric]) {
    if let Some(host_widgets) = self.host_widgets.get_mut(hostname) {
        // Collect names first; we cannot remove from the map while
        // iterating over it.
        let mut completed_services = Vec::new();

        // Check each pending transition to see if real status has changed
        for (service_name, (command_type, original_status, _start_time)) in &host_widgets.pending_service_transitions {

            // Look for status metric for this service
            for metric in service_metrics {
                if metric.name == format!("service_{}_status", service_name) {
                    let new_status = metric.value.as_string();

                    // Check if status has changed from original (command completed)
                    if &new_status != original_status {
                        // Verify it changed in the expected direction
                        let expected_change = match command_type {
                            CommandType::ServiceStart => &new_status == "active",
                            CommandType::ServiceStop => &new_status != "active",
                            _ => false,
                        };

                        if expected_change {
                            completed_services.push(service_name.clone());
                        }
                    }
                    // Only the first matching metric is considered.
                    break;
                }
            }
        }

        // Remove completed transitions
        for service_name in completed_services {
            host_widgets.pending_service_transitions.remove(&service_name);
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Render the dashboard (real btop-style multi-panel layout)
///
/// Layout: a one-line title bar, a content area split into a left column
/// (system + optional backup panels) and a right column (services), and a
/// one-line statusbar.
pub fn render(&mut self, frame: &mut Frame, metric_store: &MetricStore) {
    let size = frame.size();

    // Clear background to true black like btop
    frame.render_widget(
        Block::default().style(Style::default().bg(Theme::background())),
        size,
    );

    // Create real btop-style layout: multi-panel with borders
    // Three-section layout: title bar, main content, statusbar
    let main_chunks = Layout::default()
        .direction(Direction::Vertical)
        .constraints([
            Constraint::Length(1), // Title bar
            Constraint::Min(0), // Main content area
            Constraint::Length(1), // Statusbar
        ])
        .split(size);

    // New layout: left panels | right services (100% height)
    let content_chunks = ratatui::layout::Layout::default()
        .direction(Direction::Horizontal)
        .constraints([
            Constraint::Percentage(ThemeLayout::LEFT_PANEL_WIDTH), // Left side: system, backup
            Constraint::Percentage(ThemeLayout::RIGHT_PANEL_WIDTH), // Right side: services (100% height)
        ])
        .split(main_chunks[1]); // main_chunks[1] is now the content area (between title and statusbar)

    // Check if backup panel should be shown
    let show_backup = if let Some(hostname) = self.current_host.clone() {
        let host_widgets = self.get_or_create_host_widgets(&hostname);
        host_widgets.backup_widget.has_data()
    } else {
        false
    };

    // Left side: dynamic layout based on backup data availability
    let left_chunks = if show_backup {
        // Show both system and backup panels
        ratatui::layout::Layout::default()
            .direction(Direction::Vertical)
            .constraints([
                Constraint::Percentage(ThemeLayout::SYSTEM_PANEL_HEIGHT), // System section
                Constraint::Percentage(ThemeLayout::BACKUP_PANEL_HEIGHT), // Backup section
            ])
            .split(content_chunks[0])
    } else {
        // Show only system panel (full height)
        ratatui::layout::Layout::default()
            .direction(Direction::Vertical)
            .constraints([Constraint::Percentage(100)]) // System section takes full height
            .split(content_chunks[0])
    };

    // Render title bar
    self.render_btop_title(frame, main_chunks[0], metric_store);

    // Render new panel layout
    self.render_system_panel(frame, left_chunks[0], metric_store);
    if show_backup && left_chunks.len() > 1 {
        self.render_backup_panel(frame, left_chunks[1]);
    }

    // Render services widget for current host
    if let Some(hostname) = self.current_host.clone() {
        let is_focused = true; // Always show service selection
        // Copy the state out in an inner scope so the mutable borrow of the
        // host widgets ends before render_with_transitions takes it again.
        let (scroll_offset, pending_transitions) = {
            let host_widgets = self.get_or_create_host_widgets(&hostname);
            (host_widgets.services_scroll_offset, host_widgets.pending_service_transitions.clone())
        };
        let host_widgets = self.get_or_create_host_widgets(&hostname);
        host_widgets
            .services_widget
            .render_with_transitions(frame, content_chunks[1], is_focused, scroll_offset, &pending_transitions); // Services takes full right side
    }

    // Render statusbar at the bottom
    self.render_statusbar(frame, main_chunks[2]); // main_chunks[2] is the statusbar area

}
|
||||||
|
|
||||||
|
/// Render btop-style minimal title with host status colors
///
/// Left side shows the app name; right side lists every discovered host
/// with a status icon, the selected host in bold. The whole bar is painted
/// with the worst status color across all hosts, and text is drawn in the
/// theme background color (inverted look).
fn render_btop_title(&self, frame: &mut Frame, area: Rect, metric_store: &MetricStore) {
    use ratatui::style::Modifier;
    use ratatui::text::{Line, Span};
    use theme::StatusIcons;

    // No hosts at all: single dim banner and bail out early.
    if self.available_hosts.is_empty() {
        let title_text = "cm-dashboard • no hosts discovered";
        let title = Paragraph::new(title_text)
            .style(Style::default().fg(Theme::background()).bg(Theme::status_color(Status::Unknown)));
        frame.render_widget(title, area);
        return;
    }

    // Calculate worst-case status across all hosts
    let mut worst_status = Status::Ok;
    for host in &self.available_hosts {
        let host_status = self.calculate_host_status(host, metric_store);
        worst_status = Status::aggregate(&[worst_status, host_status]);
    }

    // Use the worst status color as background
    let background_color = Theme::status_color(worst_status);

    // Split the title bar into left and right sections
    let chunks = Layout::default()
        .direction(Direction::Horizontal)
        .constraints([Constraint::Length(15), Constraint::Min(0)])
        .split(area);

    // Left side: "cm-dashboard" text
    let left_span = Span::styled(
        " cm-dashboard",
        Style::default().fg(Theme::background()).bg(background_color).add_modifier(Modifier::BOLD)
    );
    let left_title = Paragraph::new(Line::from(vec![left_span]))
        .style(Style::default().bg(background_color));
    frame.render_widget(left_title, chunks[0]);

    // Right side: hosts with status indicators
    let mut host_spans = Vec::new();

    for (i, host) in self.available_hosts.iter().enumerate() {
        // Separator between host entries (not before the first one).
        if i > 0 {
            host_spans.push(Span::styled(
                " ",
                Style::default().fg(Theme::background()).bg(background_color)
            ));
        }

        // Always show normal status icon based on metrics (no command status at host level)
        let host_status = self.calculate_host_status(host, metric_store);
        let status_icon = StatusIcons::get_icon(host_status);

        // Add status icon with background color as foreground against status background
        host_spans.push(Span::styled(
            format!("{} ", status_icon),
            Style::default().fg(Theme::background()).bg(background_color),
        ));

        if Some(host) == self.current_host.as_ref() {
            // Selected host in bold background color against status background
            host_spans.push(Span::styled(
                host.clone(),
                Style::default()
                    .fg(Theme::background())
                    .bg(background_color)
                    .add_modifier(Modifier::BOLD),
            ));
        } else {
            // Other hosts in normal background color against status background
            host_spans.push(Span::styled(
                host.clone(),
                Style::default().fg(Theme::background()).bg(background_color),
            ));
        }
    }

    // Add right padding
    host_spans.push(Span::styled(
        " ",
        Style::default().fg(Theme::background()).bg(background_color)
    ));

    let host_line = Line::from(host_spans);
    let host_title = Paragraph::new(vec![host_line])
        .style(Style::default().bg(background_color))
        .alignment(ratatui::layout::Alignment::Right);
    frame.render_widget(host_title, chunks[1]);
}
|
||||||
|
|
||||||
|
/// Calculate overall status for a host based on its metrics
|
||||||
|
fn calculate_host_status(&self, hostname: &str, metric_store: &MetricStore) -> Status {
|
||||||
|
let metrics = metric_store.get_metrics_for_host(hostname);
|
||||||
|
|
||||||
|
if metrics.is_empty() {
|
||||||
|
return Status::Unknown;
|
||||||
|
}
|
||||||
|
|
||||||
|
// First check if we have the aggregated host status summary from the agent
|
||||||
|
if let Some(host_summary_metric) = metric_store.get_metric(hostname, "host_status_summary") {
|
||||||
|
return host_summary_metric.status;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback to old aggregation logic with proper Pending handling
|
||||||
|
let mut has_critical = false;
|
||||||
|
let mut has_warning = false;
|
||||||
|
let mut has_pending = false;
|
||||||
|
let mut ok_count = 0;
|
||||||
|
|
||||||
|
for metric in &metrics {
|
||||||
|
match metric.status {
|
||||||
|
Status::Critical => has_critical = true,
|
||||||
|
Status::Warning => has_warning = true,
|
||||||
|
Status::Pending => has_pending = true,
|
||||||
|
Status::Ok => ok_count += 1,
|
||||||
|
Status::Unknown => {} // Ignore unknown for aggregation
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Priority order: Critical > Warning > Pending > Ok > Unknown
|
||||||
|
if has_critical {
|
||||||
|
Status::Critical
|
||||||
|
} else if has_warning {
|
||||||
|
Status::Warning
|
||||||
|
} else if has_pending {
|
||||||
|
Status::Pending
|
||||||
|
} else if ok_count > 0 {
|
||||||
|
Status::Ok
|
||||||
|
} else {
|
||||||
|
Status::Unknown
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Render dynamic statusbar with context-aware shortcuts
|
||||||
|
fn render_statusbar(&self, frame: &mut Frame, area: Rect) {
|
||||||
|
let shortcuts = self.get_context_shortcuts();
|
||||||
|
let statusbar_text = shortcuts.join(" • ");
|
||||||
|
|
||||||
|
let statusbar = Paragraph::new(statusbar_text)
|
||||||
|
.style(Typography::secondary())
|
||||||
|
.alignment(ratatui::layout::Alignment::Center);
|
||||||
|
|
||||||
|
frame.render_widget(statusbar, area);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get context-aware shortcuts based on focused panel
|
||||||
|
fn get_context_shortcuts(&self) -> Vec<String> {
|
||||||
|
let mut shortcuts = Vec::new();
|
||||||
|
|
||||||
|
// Global shortcuts
|
||||||
|
shortcuts.push("Tab: Host".to_string());
|
||||||
|
shortcuts.push("↑↓/jk: Select".to_string());
|
||||||
|
shortcuts.push("r: Rebuild".to_string());
|
||||||
|
shortcuts.push("s/S: Start/Stop".to_string());
|
||||||
|
shortcuts.push("J: Logs".to_string());
|
||||||
|
shortcuts.push("L: Custom".to_string());
|
||||||
|
|
||||||
|
// Always show quit
|
||||||
|
shortcuts.push("q: Quit".to_string());
|
||||||
|
|
||||||
|
shortcuts
|
||||||
|
}
|
||||||
|
|
||||||
|
fn render_system_panel(&mut self, frame: &mut Frame, area: Rect, _metric_store: &MetricStore) {
|
||||||
|
let system_block = Components::widget_block("system");
|
||||||
|
let inner_area = system_block.inner(area);
|
||||||
|
frame.render_widget(system_block, area);
|
||||||
|
// Get current host widgets, create if none exist
|
||||||
|
if let Some(hostname) = self.current_host.clone() {
|
||||||
|
let scroll_offset = {
|
||||||
|
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||||
|
host_widgets.system_scroll_offset
|
||||||
|
};
|
||||||
|
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||||
|
host_widgets.system_widget.render_with_scroll(frame, inner_area, scroll_offset, &hostname);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn render_backup_panel(&mut self, frame: &mut Frame, area: Rect) {
|
||||||
|
let backup_block = Components::widget_block("backup");
|
||||||
|
let inner_area = backup_block.inner(area);
|
||||||
|
frame.render_widget(backup_block, area);
|
||||||
|
|
||||||
|
// Get current host widgets for backup widget
|
||||||
|
if let Some(hostname) = self.current_host.clone() {
|
||||||
|
let scroll_offset = {
|
||||||
|
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||||
|
host_widgets.backup_scroll_offset
|
||||||
|
};
|
||||||
|
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||||
|
host_widgets.backup_widget.render_with_scroll(frame, inner_area, scroll_offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,205 +0,0 @@
|
|||||||
use ratatui::layout::Rect;
|
|
||||||
use ratatui::Frame;
|
|
||||||
|
|
||||||
use crate::app::HostDisplayData;
|
|
||||||
use crate::data::metrics::ServiceStatus;
|
|
||||||
use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, connection_status_message, WidgetData, WidgetStatus, StatusLevel};
|
|
||||||
use crate::app::ConnectionStatus;
|
|
||||||
|
|
||||||
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
|
|
||||||
match host {
|
|
||||||
Some(data) => {
|
|
||||||
match (&data.connection_status, data.services.as_ref()) {
|
|
||||||
(ConnectionStatus::Connected, Some(metrics)) => {
|
|
||||||
render_metrics(frame, data, metrics, area);
|
|
||||||
}
|
|
||||||
(ConnectionStatus::Connected, None) => {
|
|
||||||
render_placeholder(
|
|
||||||
frame,
|
|
||||||
area,
|
|
||||||
"Services",
|
|
||||||
&format!("Host {} has no service metrics yet", data.name),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
(status, _) => {
|
|
||||||
render_placeholder(
|
|
||||||
frame,
|
|
||||||
area,
|
|
||||||
"Services",
|
|
||||||
&format!("Host {}: {}", data.name, connection_status_message(status, &data.last_error)),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => render_placeholder(frame, area, "Services", "No hosts configured"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Render the Services table for a connected host.
///
/// Builds a WidgetData table (Service / RAM / CPU / Disk), sorted so each
/// parent service is immediately followed by its sub-services (e.g. nginx
/// sites). Sub-service rows show only name + status (plus latency for
/// nginx sites); regular services show all resource columns.
fn render_metrics(
    frame: &mut Frame,
    _host: &HostDisplayData,
    metrics: &crate::data::metrics::ServiceMetrics,
    area: Rect,
) {
    let summary = &metrics.summary;
    let title = "Services".to_string();

    // Use agent-calculated services status
    let widget_status = status_level_from_agent_status(summary.services_status.as_ref());

    let mut data = WidgetData::new(
        title,
        Some(WidgetStatus::new(widget_status)),
        vec!["Service".to_string(), "RAM".to_string(), "CPU".to_string(), "Disk".to_string()]
    );

    // Empty host: single informational row, then done.
    if metrics.services.is_empty() {
        data.add_row(
            None,
            vec![],
            vec![
                "No services reported".to_string(),
                "".to_string(),
                "".to_string(),
                "".to_string(),
            ],
        );
        render_widget_data(frame, area, data);
        return;
    }

    // NOTE(review): `sub_service` appears to hold the PARENT service name
    // for sub-entries (None for top-level services) — confirm in the
    // metrics type definition.
    let mut services = metrics.services.clone();
    services.sort_by(|a, b| {
        // First, determine the primary service name for grouping
        let primary_a = a.sub_service.as_ref().unwrap_or(&a.name);
        let primary_b = b.sub_service.as_ref().unwrap_or(&b.name);

        // Sort by primary service name first
        match primary_a.cmp(primary_b) {
            std::cmp::Ordering::Equal => {
                // Same primary service, put parent service first, then sub-services alphabetically
                match (a.sub_service.as_ref(), b.sub_service.as_ref()) {
                    (None, Some(_)) => std::cmp::Ordering::Less, // Parent comes before sub-services
                    (Some(_), None) => std::cmp::Ordering::Greater, // Sub-services come after parent
                    _ => a.name.cmp(&b.name), // Both same type, sort by name
                }
            }
            other => other, // Different primary services, sort alphabetically
        }
    });

    for svc in services {
        // Map agent service state onto a display status level.
        let status_level = match svc.status {
            ServiceStatus::Running => StatusLevel::Ok,
            ServiceStatus::Degraded => StatusLevel::Warning,
            ServiceStatus::Restarting => StatusLevel::Warning,
            ServiceStatus::Stopped => StatusLevel::Error,
        };

        // Service row with optional description(s)
        let description = if let Some(desc_vec) = &svc.description {
            desc_vec.clone()
        } else {
            vec![]
        };

        if svc.sub_service.is_some() {
            // Sub-services (nginx sites) only show name and status, no memory/CPU/disk data
            // Add latency information for nginx sites if available
            let service_name_with_latency = if let Some(parent) = &svc.sub_service {
                if parent == "nginx" {
                    // Extract subdomain part for shorter display
                    let short_name = if let Some(dot_pos) = svc.name.find('.') {
                        &svc.name[..dot_pos]
                    } else {
                        &svc.name
                    };

                    // >= 2000 ms is treated the same as no measurement:
                    // the probe timed out or never connected.
                    match &svc.latency_ms {
                        Some(latency) if *latency >= 2000.0 => format!("{} → unreachable", short_name), // Timeout (2s+)
                        Some(latency) => format!("{} → {:.0}ms", short_name, latency),
                        None => format!("{} → unreachable", short_name), // Connection failed
                    }
                } else {
                    svc.name.clone()
                }
            } else {
                svc.name.clone()
            };

            data.add_row_with_sub_service(
                Some(WidgetStatus::new(status_level)),
                description,
                vec![
                    service_name_with_latency,
                    "".to_string(),
                    "".to_string(),
                    "".to_string(),
                ],
                svc.sub_service.clone(),
            );
        } else {
            // Regular services show all columns
            data.add_row(
                Some(WidgetStatus::new(status_level)),
                description,
                vec![
                    svc.name.clone(),
                    format_memory_value(svc.memory_used_mb, svc.memory_quota_mb),
                    format_cpu_value(svc.cpu_percent),
                    format_disk_value(svc.disk_used_gb, svc.disk_quota_gb),
                ],
            );
        }
    }

    render_widget_data(frame, area, data);
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// Human-readable size from a megabyte figure: "<1MB", kB, MB, or GB
/// (decimal, 1000-based) depending on magnitude.
fn format_bytes(mb: f32) -> String {
    match mb {
        m if m < 0.1 => "<1MB".to_string(),
        m if m < 1.0 => format!("{:.0}kB", m * 1000.0),
        m if m < 1000.0 => format!("{:.0}MB", m),
        m => format!("{:.1}GB", m / 1000.0),
    }
}
|
|
||||||
|
|
||||||
fn format_memory_value(used: f32, quota: f32) -> String {
|
|
||||||
let used_value = format_bytes(used);
|
|
||||||
|
|
||||||
if quota > 0.05 {
|
|
||||||
let quota_gb = quota / 1000.0;
|
|
||||||
// Format quota without decimals and use GB
|
|
||||||
format!("{} ({}GB)", used_value, quota_gb as u32)
|
|
||||||
} else {
|
|
||||||
used_value
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Format a CPU percentage; anything below 0.1% displays as "0.0%".
fn format_cpu_value(cpu_percent: f32) -> String {
    match cpu_percent {
        p if p >= 0.1 => format!("{:.1}%", p),
        _ => "0.0%".to_string(),
    }
}
|
|
||||||
|
|
||||||
fn format_disk_value(used: f32, quota: f32) -> String {
|
|
||||||
let used_value = format_bytes(used * 1000.0); // Convert GB to MB for format_bytes
|
|
||||||
|
|
||||||
if quota > 0.05 {
|
|
||||||
// Format quota without decimals and use GB (round to nearest GB)
|
|
||||||
format!("{} ({}GB)", used_value, quota.round() as u32)
|
|
||||||
} else {
|
|
||||||
used_value
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@@ -1,142 +0,0 @@
|
|||||||
use ratatui::layout::Rect;
|
|
||||||
use ratatui::Frame;
|
|
||||||
|
|
||||||
use crate::app::HostDisplayData;
|
|
||||||
use crate::data::metrics::SmartMetrics;
|
|
||||||
use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, connection_status_message, WidgetData, WidgetStatus, StatusLevel};
|
|
||||||
use crate::app::ConnectionStatus;
|
|
||||||
|
|
||||||
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
|
|
||||||
match host {
|
|
||||||
Some(data) => {
|
|
||||||
match (&data.connection_status, data.smart.as_ref()) {
|
|
||||||
(ConnectionStatus::Connected, Some(metrics)) => {
|
|
||||||
render_metrics(frame, data, metrics, area);
|
|
||||||
}
|
|
||||||
(ConnectionStatus::Connected, None) => {
|
|
||||||
render_placeholder(
|
|
||||||
frame,
|
|
||||||
area,
|
|
||||||
"Storage",
|
|
||||||
&format!("Host {} has no SMART data yet", data.name),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
(status, _) => {
|
|
||||||
render_placeholder(
|
|
||||||
frame,
|
|
||||||
area,
|
|
||||||
"Storage",
|
|
||||||
&format!("Host {}: {}", data.name, connection_status_message(status, &data.last_error)),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => render_placeholder(frame, area, "Storage", "No hosts configured"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Render the Storage (SMART) table for a connected host.
///
/// One row per drive with Name / Temp / Wear / Usage columns; per-drive
/// issues from the metrics are appended to that drive's description by a
/// case-insensitive substring match on the drive name.
fn render_metrics(frame: &mut Frame, _host: &HostDisplayData, metrics: &SmartMetrics, area: Rect) {
    let title = "Storage".to_string();

    // Agent-calculated overall storage status drives the widget header.
    let widget_status = status_level_from_agent_status(Some(&metrics.status));

    let mut data = WidgetData::new(
        title,
        Some(WidgetStatus::new(widget_status)),
        vec!["Name".to_string(), "Temp".to_string(), "Wear".to_string(), "Usage".to_string()]
    );

    if metrics.drives.is_empty() {
        // No drives: single informational row.
        data.add_row(
            None,
            vec![],
            vec![
                "No drives reported".to_string(),
                "".to_string(),
                "".to_string(),
                "".to_string(),
            ],
        );
    } else {
        for drive in &metrics.drives {
            let status_level = drive_status_level(metrics, &drive.name);

            // Use agent-provided descriptions (agent is source of truth)
            let mut description = drive.description.clone().unwrap_or_default();

            // Add drive-specific issues as additional description lines
            for issue in &metrics.issues {
                if issue.to_lowercase().contains(&drive.name.to_lowercase()) {
                    description.push(format!("Issue: {}", issue));
                }
            }

            data.add_row(
                Some(WidgetStatus::new(status_level)),
                description,
                vec![
                    drive.name.clone(),
                    format_temperature(drive.temperature_c),
                    format_percent(drive.wear_level),
                    format_usage(drive.used_gb, drive.capacity_gb),
                ],
            );
        }
    }

    render_widget_data(frame, area, data);
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Formats a drive temperature in whole degrees Celsius.
///
/// A value of exactly 0 is the agent's "not reported" sentinel and is
/// rendered as an em dash.
fn format_temperature(value: f32) -> String {
    if value.abs() >= f32::EPSILON {
        format!("{:.0}°C", value)
    } else {
        "—".to_string()
    }
}
|
|
||||||
|
|
||||||
/// Formats a percentage with no decimals.
///
/// A value of exactly 0 is the agent's "not reported" sentinel and is
/// rendered as an em dash.
fn format_percent(value: f32) -> String {
    if value.abs() >= f32::EPSILON {
        format!("{:.0}%", value)
    } else {
        "—".to_string()
    }
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// Formats used/total disk capacity in whole gigabytes.
///
/// Absent or non-positive values are treated uniformly as "not reported" and
/// rendered as an em dash. Previously `(Some(0), Some(n))` and
/// `(Some(n), Some(0))` fell through to the catch-all arm and discarded the
/// known component, while the equivalent `None` cases kept it; normalizing
/// non-positive readings to `None` first makes the four cases consistent.
fn format_usage(used: Option<f32>, capacity: Option<f32>) -> String {
    // Normalize: a non-positive reading means "no data".
    let used = used.filter(|v| *v > 0.0);
    let capacity = capacity.filter(|v| *v > 0.0);

    match (used, capacity) {
        (Some(used_gb), Some(total_gb)) => format!("{:.0}GB ({:.0}GB)", used_gb, total_gb),
        (Some(used_gb), None) => format!("{:.0}GB", used_gb),
        (None, Some(total_gb)) => format!("— ({:.0}GB)", total_gb),
        (None, None) => "—".to_string(),
    }
}
|
|
||||||
|
|
||||||
fn drive_status_level(metrics: &SmartMetrics, drive_name: &str) -> StatusLevel {
|
|
||||||
if metrics.summary.critical > 0
|
|
||||||
|| metrics.issues.iter().any(|issue| {
|
|
||||||
issue.to_lowercase().contains(&drive_name.to_lowercase())
|
|
||||||
&& issue.to_lowercase().contains("fail")
|
|
||||||
})
|
|
||||||
{
|
|
||||||
StatusLevel::Error
|
|
||||||
} else if metrics.summary.warning > 0
|
|
||||||
|| metrics
|
|
||||||
.issues
|
|
||||||
.iter()
|
|
||||||
.any(|issue| issue.to_lowercase().contains(&drive_name.to_lowercase()))
|
|
||||||
{
|
|
||||||
StatusLevel::Warning
|
|
||||||
} else {
|
|
||||||
StatusLevel::Ok
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,124 +0,0 @@
|
|||||||
use ratatui::layout::Rect;
|
|
||||||
use ratatui::Frame;
|
|
||||||
|
|
||||||
use crate::app::HostDisplayData;
|
|
||||||
use crate::data::metrics::SystemMetrics;
|
|
||||||
use crate::ui::widget::{
|
|
||||||
render_placeholder, render_combined_widget_data,
|
|
||||||
status_level_from_agent_status, connection_status_message, WidgetDataSet, WidgetStatus, StatusLevel,
|
|
||||||
};
|
|
||||||
use crate::app::ConnectionStatus;
|
|
||||||
|
|
||||||
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
|
|
||||||
match host {
|
|
||||||
Some(data) => {
|
|
||||||
match (&data.connection_status, data.system.as_ref()) {
|
|
||||||
(ConnectionStatus::Connected, Some(metrics)) => {
|
|
||||||
render_metrics(frame, data, metrics, area);
|
|
||||||
}
|
|
||||||
(ConnectionStatus::Connected, None) => {
|
|
||||||
render_placeholder(
|
|
||||||
frame,
|
|
||||||
area,
|
|
||||||
"System",
|
|
||||||
&format!("Host {} awaiting system metrics", data.name),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
(status, _) => {
|
|
||||||
render_placeholder(
|
|
||||||
frame,
|
|
||||||
area,
|
|
||||||
"System",
|
|
||||||
&format!("Host {}: {}", data.name, connection_status_message(status, &data.last_error)),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => render_placeholder(frame, area, "System", "No hosts configured"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn render_metrics(
|
|
||||||
frame: &mut Frame,
|
|
||||||
_host: &HostDisplayData,
|
|
||||||
metrics: &SystemMetrics,
|
|
||||||
area: Rect,
|
|
||||||
) {
|
|
||||||
let summary = &metrics.summary;
|
|
||||||
|
|
||||||
// Use agent-calculated statuses
|
|
||||||
let memory_status = status_level_from_agent_status(summary.memory_status.as_ref());
|
|
||||||
let cpu_status = status_level_from_agent_status(summary.cpu_status.as_ref());
|
|
||||||
|
|
||||||
// Determine overall widget status based on worst case from agent statuses
|
|
||||||
let overall_status_level = match (memory_status, cpu_status) {
|
|
||||||
(StatusLevel::Error, _) | (_, StatusLevel::Error) => StatusLevel::Error,
|
|
||||||
(StatusLevel::Warning, _) | (_, StatusLevel::Warning) => StatusLevel::Warning,
|
|
||||||
(StatusLevel::Ok, StatusLevel::Ok) => StatusLevel::Ok,
|
|
||||||
_ => StatusLevel::Unknown,
|
|
||||||
};
|
|
||||||
let overall_status = Some(WidgetStatus::new(overall_status_level));
|
|
||||||
|
|
||||||
// Single dataset with RAM, CPU load, CPU temp as columns
|
|
||||||
let mut system_dataset = WidgetDataSet::new(
|
|
||||||
vec!["RAM usage".to_string(), "CPU load".to_string(), "CPU temp".to_string()],
|
|
||||||
overall_status.clone()
|
|
||||||
);
|
|
||||||
|
|
||||||
// Use agent-provided C-states and logged-in users as description
|
|
||||||
let mut description_lines = Vec::new();
|
|
||||||
|
|
||||||
// Add C-states with prefix on first line, indent subsequent lines
|
|
||||||
if let Some(cstates) = &summary.cpu_cstate {
|
|
||||||
for (i, cstate_line) in cstates.iter().enumerate() {
|
|
||||||
if i == 0 {
|
|
||||||
description_lines.push(format!("C-State: {}", cstate_line));
|
|
||||||
} else {
|
|
||||||
description_lines.push(format!(" {}", cstate_line));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add logged-in users to description
|
|
||||||
if let Some(users) = &summary.logged_in_users {
|
|
||||||
if !users.is_empty() {
|
|
||||||
let user_line = if users.len() == 1 {
|
|
||||||
format!("Logged in: {}", users[0])
|
|
||||||
} else {
|
|
||||||
format!("Logged in: {} users ({})", users.len(), users.join(", "))
|
|
||||||
};
|
|
||||||
description_lines.push(user_line);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add top CPU process
|
|
||||||
if let Some(cpu_proc) = &summary.top_cpu_process {
|
|
||||||
description_lines.push(format!("Top CPU: {}", cpu_proc));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add top RAM process
|
|
||||||
if let Some(ram_proc) = &summary.top_ram_process {
|
|
||||||
description_lines.push(format!("Top RAM: {}", ram_proc));
|
|
||||||
}
|
|
||||||
|
|
||||||
system_dataset.add_row(
|
|
||||||
overall_status.clone(),
|
|
||||||
description_lines,
|
|
||||||
vec![
|
|
||||||
format!("{:.1} / {:.1} GB", summary.memory_used_mb / 1000.0, summary.memory_total_mb / 1000.0),
|
|
||||||
format!("{:.2} • {:.2} • {:.2}", summary.cpu_load_1, summary.cpu_load_5, summary.cpu_load_15),
|
|
||||||
format_optional_metric(summary.cpu_temp_c, "°C"),
|
|
||||||
],
|
|
||||||
);
|
|
||||||
|
|
||||||
// Render single dataset
|
|
||||||
render_combined_widget_data(frame, area, "System".to_string(), overall_status, vec![system_dataset]);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Formats an optional reading to one decimal place with the given unit
/// suffix, or an em dash when the value is absent.
fn format_optional_metric(value: Option<f32>, unit: &str) -> String {
    value
        .map(|number| format!("{:.1}{}", number, unit))
        .unwrap_or_else(|| "—".to_string())
}
|
|
||||||
|
|
||||||
325
dashboard/src/ui/theme.rs
Normal file
325
dashboard/src/ui/theme.rs
Normal file
@@ -0,0 +1,325 @@
|
|||||||
|
use cm_dashboard_shared::Status;
|
||||||
|
use ratatui::style::{Color, Modifier, Style};
|
||||||
|
use ratatui::widgets::{Block, Borders};
|
||||||
|
|
||||||
|
/// Complete terminal color palette matching your configuration.
///
/// Mirrors a 16-color terminal scheme — normal / bright / dim ramps of the
/// eight base colors — plus the primary foreground/background colors, so the
/// dashboard renders with the same palette as the surrounding terminal.
/// Values are populated by the `Default` impl below.
#[allow(dead_code)]
pub struct TerminalColors {
    // Primary colors
    pub foreground: Color,
    pub dim_foreground: Color,
    pub bright_foreground: Color,
    pub background: Color,

    // Normal colors
    pub normal_black: Color,
    pub normal_red: Color,
    pub normal_green: Color,
    pub normal_yellow: Color,
    pub normal_blue: Color,
    pub normal_magenta: Color,
    pub normal_cyan: Color,
    pub normal_white: Color,

    // Bright colors
    pub bright_black: Color,
    pub bright_red: Color,
    pub bright_green: Color,
    pub bright_yellow: Color,
    pub bright_blue: Color,
    pub bright_magenta: Color,
    pub bright_cyan: Color,
    pub bright_white: Color,

    // Dim colors
    pub dim_black: Color,
    pub dim_red: Color,
    pub dim_green: Color,
    pub dim_yellow: Color,
    pub dim_blue: Color,
    pub dim_magenta: Color,
    pub dim_cyan: Color,
    pub dim_white: Color,
}
|
||||||
|
|
||||||
|
impl Default for TerminalColors {
    /// Builds the default palette; each line's trailing comment records the
    /// source hex value the RGB triple was derived from. Note the bright and
    /// dim ramps intentionally reuse the normal values for most hues.
    fn default() -> Self {
        Self {
            // Primary colors
            foreground: Color::Rgb(198, 198, 198), // #c6c6c6
            dim_foreground: Color::Rgb(112, 112, 112), // #707070
            bright_foreground: Color::Rgb(255, 255, 255), // #ffffff
            background: Color::Rgb(38, 38, 38), // #262626

            // Normal colors
            normal_black: Color::Rgb(0, 0, 0), // #000000
            normal_red: Color::Rgb(215, 84, 0), // #d75400
            normal_green: Color::Rgb(175, 215, 135), // #afd787
            normal_yellow: Color::Rgb(215, 175, 95), // #d7af5f
            normal_blue: Color::Rgb(135, 175, 215), // #87afd7
            normal_magenta: Color::Rgb(215, 215, 175), // #d7d7af
            normal_cyan: Color::Rgb(160, 160, 160), // #a0a0a0
            normal_white: Color::Rgb(238, 238, 238), // #eeeeee

            // Bright colors
            bright_black: Color::Rgb(48, 48, 48), // #303030
            bright_red: Color::Rgb(215, 84, 0), // #d75400
            bright_green: Color::Rgb(175, 215, 135), // #afd787
            bright_yellow: Color::Rgb(215, 175, 95), // #d7af5f
            bright_blue: Color::Rgb(135, 175, 215), // #87afd7
            bright_magenta: Color::Rgb(215, 215, 175), // #d7d7af
            bright_cyan: Color::Rgb(160, 160, 160), // #a0a0a0
            bright_white: Color::Rgb(255, 255, 255), // #ffffff

            // Dim colors
            dim_black: Color::Rgb(0, 0, 0), // #000000
            dim_red: Color::Rgb(215, 84, 0), // #d75400
            dim_green: Color::Rgb(175, 215, 135), // #afd787
            dim_yellow: Color::Rgb(215, 175, 95), // #d7af5f
            dim_blue: Color::Rgb(135, 175, 215), // #87afd7
            dim_magenta: Color::Rgb(215, 215, 175), // #d7d7af
            dim_cyan: Color::Rgb(160, 160, 160), // #a0a0a0
            dim_white: Color::Rgb(221, 221, 221), // #dddddd
        }
    }
}
|
||||||
|
|
||||||
|
/// Comprehensive theming engine for dashboard consistency
|
||||||
|
pub struct Theme;
|
||||||
|
|
||||||
|
#[allow(dead_code)]
|
||||||
|
impl Theme {
|
||||||
|
fn colors() -> &'static TerminalColors {
|
||||||
|
static COLORS: std::sync::OnceLock<TerminalColors> = std::sync::OnceLock::new();
|
||||||
|
COLORS.get_or_init(TerminalColors::default)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Semantic color mapping using the terminal color struct
|
||||||
|
pub fn primary_text() -> Color {
|
||||||
|
Self::colors().normal_white
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn secondary_text() -> Color {
|
||||||
|
Self::colors().foreground
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn muted_text() -> Color {
|
||||||
|
Self::colors().dim_foreground
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn border() -> Color {
|
||||||
|
Self::colors().dim_foreground
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn border_title() -> Color {
|
||||||
|
Self::colors().bright_white
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn background() -> Color {
|
||||||
|
Self::colors().background
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn success() -> Color {
|
||||||
|
Self::colors().normal_green
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn warning() -> Color {
|
||||||
|
Self::colors().normal_yellow
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn error() -> Color {
|
||||||
|
Self::colors().normal_red
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn info() -> Color {
|
||||||
|
Self::colors().normal_cyan
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn highlight() -> Color {
|
||||||
|
Self::colors().normal_blue
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get color for status level
|
||||||
|
pub fn status_color(status: Status) -> Color {
|
||||||
|
match status {
|
||||||
|
Status::Ok => Self::success(),
|
||||||
|
Status::Pending => Self::highlight(), // Blue for pending
|
||||||
|
Status::Warning => Self::warning(),
|
||||||
|
Status::Critical => Self::error(),
|
||||||
|
Status::Unknown => Self::muted_text(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get style for status level
|
||||||
|
pub fn status_style(status: Status) -> Style {
|
||||||
|
Style::default().fg(Self::status_color(status))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// CPU usage colors using terminal color struct
|
||||||
|
pub fn cpu_color(percentage: u16) -> Color {
|
||||||
|
match percentage {
|
||||||
|
0..=25 => Self::colors().normal_green, // Low usage
|
||||||
|
26..=50 => Self::colors().normal_yellow, // Medium usage
|
||||||
|
51..=75 => Self::colors().normal_magenta, // High usage
|
||||||
|
76..=100 => Self::colors().normal_red, // Critical usage
|
||||||
|
_ => Self::colors().normal_red, // Over 100%
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Memory usage colors using terminal color struct
|
||||||
|
pub fn memory_color(percentage: u16) -> Color {
|
||||||
|
match percentage {
|
||||||
|
0..=60 => Self::colors().normal_green, // Low usage
|
||||||
|
61..=80 => Self::colors().normal_yellow, // Medium usage
|
||||||
|
81..=95 => Self::colors().normal_magenta, // High usage
|
||||||
|
96..=100 => Self::colors().normal_red, // Critical usage
|
||||||
|
_ => Self::colors().normal_red, // Over 100%
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get gauge color based on percentage
|
||||||
|
pub fn gauge_color(percentage: u16, warning_threshold: u16, critical_threshold: u16) -> Color {
|
||||||
|
if percentage >= critical_threshold {
|
||||||
|
Self::error()
|
||||||
|
} else if percentage >= warning_threshold {
|
||||||
|
Self::warning()
|
||||||
|
} else {
|
||||||
|
Self::success()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Widget border style
|
||||||
|
pub fn widget_border_style() -> Style {
|
||||||
|
Style::default().fg(Self::border()).bg(Self::background())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Inactive widget border style
|
||||||
|
pub fn widget_border_inactive_style() -> Style {
|
||||||
|
Style::default()
|
||||||
|
.fg(Self::muted_text())
|
||||||
|
.bg(Self::background())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Title style
|
||||||
|
pub fn title_style() -> Style {
|
||||||
|
Style::default()
|
||||||
|
.fg(Self::border_title())
|
||||||
|
.bg(Self::background())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Status bar style
|
||||||
|
pub fn status_bar_style() -> Style {
|
||||||
|
Style::default()
|
||||||
|
.fg(Self::muted_text())
|
||||||
|
.bg(Self::background())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Layout and spacing constants.
///
/// All values are percentages used to split the frame.
/// NOTE(review): each width/height pair sums to 100 — keep them in sync.
pub struct Layout;

impl Layout {
    /// Left panel percentage (system + backup)
    pub const LEFT_PANEL_WIDTH: u16 = 45;
    /// Right panel percentage (services)
    pub const RIGHT_PANEL_WIDTH: u16 = 55;
    /// System vs backup split (equal)
    pub const SYSTEM_PANEL_HEIGHT: u16 = 50;
    /// Bottom half of the left panel; pairs with `SYSTEM_PANEL_HEIGHT`.
    pub const BACKUP_PANEL_HEIGHT: u16 = 50;
}
|
||||||
|
|
||||||
|
/// Typography system.
///
/// Unit struct used purely as a namespace for text styles; see the
/// `impl Typography` block.
pub struct Typography;

/// Component styling system.
///
/// Unit struct namespacing reusable widget builders (e.g. bordered blocks);
/// see the `impl Components` block.
pub struct Components;

/// Status icons and styling.
///
/// Unit struct namespacing status-icon glyph selection and colored span
/// construction; see the `impl StatusIcons` block.
pub struct StatusIcons;
|
||||||
|
|
||||||
|
impl StatusIcons {
|
||||||
|
/// Get status icon symbol
|
||||||
|
pub fn get_icon(status: Status) -> &'static str {
|
||||||
|
match status {
|
||||||
|
Status::Ok => "●",
|
||||||
|
Status::Pending => "◉", // Hollow circle for pending
|
||||||
|
Status::Warning => "◐",
|
||||||
|
Status::Critical => "◯",
|
||||||
|
Status::Unknown => "?",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create spans with status icon colored and text in foreground color
|
||||||
|
pub fn create_status_spans(status: Status, text: &str) -> Vec<ratatui::text::Span<'static>> {
|
||||||
|
let icon = Self::get_icon(status);
|
||||||
|
let status_color = match status {
|
||||||
|
Status::Ok => Theme::success(), // Green
|
||||||
|
Status::Pending => Theme::highlight(), // Blue
|
||||||
|
Status::Warning => Theme::warning(), // Yellow
|
||||||
|
Status::Critical => Theme::error(), // Red
|
||||||
|
Status::Unknown => Theme::muted_text(), // Gray
|
||||||
|
};
|
||||||
|
|
||||||
|
vec![
|
||||||
|
ratatui::text::Span::styled(
|
||||||
|
format!("{} ", icon),
|
||||||
|
Style::default().fg(status_color).bg(Theme::background()),
|
||||||
|
),
|
||||||
|
ratatui::text::Span::styled(
|
||||||
|
text.to_string(),
|
||||||
|
Style::default()
|
||||||
|
.fg(Theme::secondary_text())
|
||||||
|
.bg(Theme::background()),
|
||||||
|
),
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Components {
|
||||||
|
/// Standard widget block with title using bright foreground for title
|
||||||
|
pub fn widget_block(title: &str) -> Block<'_> {
|
||||||
|
Block::default()
|
||||||
|
.title(title)
|
||||||
|
.borders(Borders::ALL)
|
||||||
|
.style(Style::default().fg(Theme::border()).bg(Theme::background()))
|
||||||
|
.title_style(
|
||||||
|
Style::default()
|
||||||
|
.fg(Theme::border_title())
|
||||||
|
.bg(Theme::background()),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Typography {
|
||||||
|
|
||||||
|
/// Widget title style (panel headers) - bold bright white
|
||||||
|
pub fn widget_title() -> Style {
|
||||||
|
Style::default()
|
||||||
|
.fg(Color::White)
|
||||||
|
.bg(Theme::background())
|
||||||
|
.add_modifier(Modifier::BOLD)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Secondary content text
|
||||||
|
pub fn secondary() -> Style {
|
||||||
|
Style::default()
|
||||||
|
.fg(Theme::secondary_text())
|
||||||
|
.bg(Theme::background())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Muted text (inactive items, placeholders) - now bold bright white for headers
|
||||||
|
pub fn muted() -> Style {
|
||||||
|
Style::default()
|
||||||
|
.fg(Color::White)
|
||||||
|
.bg(Theme::background())
|
||||||
|
.add_modifier(Modifier::BOLD)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tree symbols style (blue color)
|
||||||
|
pub fn tree() -> Style {
|
||||||
|
Style::default()
|
||||||
|
.fg(Theme::highlight())
|
||||||
|
.bg(Theme::background())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,527 +0,0 @@
|
|||||||
use ratatui::layout::{Constraint, Rect};
|
|
||||||
use ratatui::style::{Color, Modifier, Style};
|
|
||||||
use ratatui::text::{Line, Span};
|
|
||||||
use ratatui::widgets::{Block, Borders, Cell, Paragraph, Row, Table, Wrap};
|
|
||||||
use ratatui::Frame;
|
|
||||||
|
|
||||||
|
|
||||||
pub fn heading_row_style() -> Style {
|
|
||||||
neutral_text_style().add_modifier(Modifier::BOLD)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Base text style: terminal defaults — no explicit color or modifier, so the
// terminal's own foreground/background show through.
fn neutral_text_style() -> Style {
    Style::default()
}
|
|
||||||
|
|
||||||
fn neutral_title_span(title: &str) -> Span<'static> {
|
|
||||||
Span::styled(
|
|
||||||
title.to_string(),
|
|
||||||
neutral_text_style().add_modifier(Modifier::BOLD),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Border style carrying only the given foreground color; everything else is
// left at terminal defaults.
fn neutral_border_style(color: Color) -> Style {
    Style::default().fg(color)
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
pub fn status_level_from_agent_status(agent_status: Option<&String>) -> StatusLevel {
|
|
||||||
match agent_status.map(|s| s.as_str()) {
|
|
||||||
Some("critical") => StatusLevel::Error,
|
|
||||||
Some("warning") => StatusLevel::Warning,
|
|
||||||
Some("ok") => StatusLevel::Ok,
|
|
||||||
Some("unknown") => StatusLevel::Unknown,
|
|
||||||
_ => StatusLevel::Unknown,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn connection_status_message(connection_status: &crate::app::ConnectionStatus, last_error: &Option<String>) -> String {
|
|
||||||
use crate::app::ConnectionStatus;
|
|
||||||
match connection_status {
|
|
||||||
ConnectionStatus::Connected => "Connected".to_string(),
|
|
||||||
ConnectionStatus::Timeout => {
|
|
||||||
if let Some(error) = last_error {
|
|
||||||
format!("Timeout: {}", error)
|
|
||||||
} else {
|
|
||||||
"Keep-alive timeout".to_string()
|
|
||||||
}
|
|
||||||
},
|
|
||||||
ConnectionStatus::Error => {
|
|
||||||
if let Some(error) = last_error {
|
|
||||||
format!("Error: {}", error)
|
|
||||||
} else {
|
|
||||||
"Connection error".to_string()
|
|
||||||
}
|
|
||||||
},
|
|
||||||
ConnectionStatus::Unknown => "No data received".to_string(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
pub fn render_placeholder(frame: &mut Frame, area: Rect, title: &str, message: &str) {
|
|
||||||
let block = Block::default()
|
|
||||||
.title(neutral_title_span(title))
|
|
||||||
.borders(Borders::ALL)
|
|
||||||
.border_style(neutral_border_style(Color::Gray));
|
|
||||||
|
|
||||||
let inner = block.inner(area);
|
|
||||||
frame.render_widget(block, area);
|
|
||||||
frame.render_widget(
|
|
||||||
Paragraph::new(Line::from(message))
|
|
||||||
.wrap(Wrap { trim: true })
|
|
||||||
.style(neutral_text_style()),
|
|
||||||
inner,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_last_sub_service_in_group(rows: &[WidgetRow], current_idx: usize, parent_service: &Option<String>) -> bool {
|
|
||||||
if let Some(parent) = parent_service {
|
|
||||||
// Look ahead to see if there are any more sub-services for this parent
|
|
||||||
for i in (current_idx + 1)..rows.len() {
|
|
||||||
if let Some(ref other_parent) = rows[i].sub_service {
|
|
||||||
if other_parent == parent {
|
|
||||||
return false; // Found another sub-service for same parent
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
true // No more sub-services found for this parent
|
|
||||||
} else {
|
|
||||||
false // Not a sub-service
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Renders a single-dataset widget by delegating to the combined renderer,
/// consuming `data` and wrapping its one dataset in a vec.
pub fn render_widget_data(frame: &mut Frame, area: Rect, data: WidgetData) {
    render_combined_widget_data(frame, area, data.title, data.status, vec![data.dataset]);
}
|
|
||||||
|
|
||||||
pub fn render_combined_widget_data(frame: &mut Frame, area: Rect, title: String, status: Option<WidgetStatus>, datasets: Vec<WidgetDataSet>) {
|
|
||||||
if datasets.is_empty() {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create border and title - determine color from widget status
|
|
||||||
let border_color = status.as_ref()
|
|
||||||
.map(|s| s.status.to_color())
|
|
||||||
.unwrap_or(Color::Reset);
|
|
||||||
let block = Block::default()
|
|
||||||
.title(neutral_title_span(&title))
|
|
||||||
.borders(Borders::ALL)
|
|
||||||
.border_style(neutral_border_style(border_color));
|
|
||||||
|
|
||||||
let inner = block.inner(area);
|
|
||||||
frame.render_widget(block, area);
|
|
||||||
|
|
||||||
// Split multi-row datasets into single-row datasets when wrapping is needed
|
|
||||||
let split_datasets = split_multirow_datasets_with_area(datasets, inner);
|
|
||||||
|
|
||||||
let mut current_y = inner.y;
|
|
||||||
|
|
||||||
for dataset in split_datasets.iter() {
|
|
||||||
if current_y >= inner.y + inner.height {
|
|
||||||
break; // No more space
|
|
||||||
}
|
|
||||||
|
|
||||||
current_y += render_dataset_with_wrapping(frame, dataset, inner, current_y);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn split_multirow_datasets_with_area(datasets: Vec<WidgetDataSet>, inner: Rect) -> Vec<WidgetDataSet> {
|
|
||||||
let mut result = Vec::new();
|
|
||||||
|
|
||||||
for dataset in datasets {
|
|
||||||
if dataset.rows.len() <= 1 {
|
|
||||||
// Single row or empty - keep as is
|
|
||||||
result.push(dataset);
|
|
||||||
} else {
|
|
||||||
// Multiple rows - check if wrapping is needed using actual available width
|
|
||||||
if dataset_needs_wrapping_with_width(&dataset, inner.width) {
|
|
||||||
// Split into separate datasets for individual wrapping
|
|
||||||
for row in dataset.rows {
|
|
||||||
let single_row_dataset = WidgetDataSet {
|
|
||||||
colnames: dataset.colnames.clone(),
|
|
||||||
status: dataset.status.clone(),
|
|
||||||
rows: vec![row],
|
|
||||||
};
|
|
||||||
result.push(single_row_dataset);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// No wrapping needed - keep as single dataset
|
|
||||||
result.push(dataset);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
result
|
|
||||||
}
|
|
||||||
|
|
||||||
fn dataset_needs_wrapping_with_width(dataset: &WidgetDataSet, available_width: u16) -> bool {
|
|
||||||
// Calculate column widths
|
|
||||||
let mut column_widths = Vec::new();
|
|
||||||
for (col_index, colname) in dataset.colnames.iter().enumerate() {
|
|
||||||
let mut max_width = colname.chars().count() as u16;
|
|
||||||
|
|
||||||
// Check data rows for this column width
|
|
||||||
for row in &dataset.rows {
|
|
||||||
if let Some(widget_value) = row.values.get(col_index) {
|
|
||||||
let data_width = widget_value.chars().count() as u16;
|
|
||||||
max_width = max_width.max(data_width);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let column_width = (max_width + 1).min(25).max(6);
|
|
||||||
column_widths.push(column_width);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Calculate total width needed
|
|
||||||
let status_col_width = 1u16;
|
|
||||||
let col_spacing = 1u16;
|
|
||||||
let mut total_width = status_col_width + col_spacing;
|
|
||||||
|
|
||||||
for &col_width in &column_widths {
|
|
||||||
total_width += col_width + col_spacing;
|
|
||||||
}
|
|
||||||
|
|
||||||
total_width > available_width
|
|
||||||
}
|
|
||||||
|
|
||||||
fn render_dataset_with_wrapping(frame: &mut Frame, dataset: &WidgetDataSet, inner: Rect, start_y: u16) -> u16 {
|
|
||||||
if dataset.colnames.is_empty() || dataset.rows.is_empty() {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Calculate column widths
|
|
||||||
let mut column_widths = Vec::new();
|
|
||||||
for (col_index, colname) in dataset.colnames.iter().enumerate() {
|
|
||||||
let mut max_width = colname.chars().count() as u16;
|
|
||||||
|
|
||||||
// Check data rows for this column width
|
|
||||||
for row in &dataset.rows {
|
|
||||||
if let Some(widget_value) = row.values.get(col_index) {
|
|
||||||
let data_width = widget_value.chars().count() as u16;
|
|
||||||
max_width = max_width.max(data_width);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let column_width = (max_width + 1).min(25).max(6);
|
|
||||||
column_widths.push(column_width);
|
|
||||||
}
|
|
||||||
|
|
||||||
let status_col_width = 1u16;
|
|
||||||
let col_spacing = 1u16;
|
|
||||||
let available_width = inner.width;
|
|
||||||
|
|
||||||
// Determine how many columns fit
|
|
||||||
let mut total_width = status_col_width + col_spacing;
|
|
||||||
let mut cols_that_fit = 0;
|
|
||||||
|
|
||||||
for &col_width in &column_widths {
|
|
||||||
let new_total = total_width + col_width + col_spacing;
|
|
||||||
if new_total <= available_width {
|
|
||||||
total_width = new_total;
|
|
||||||
cols_that_fit += 1;
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if cols_that_fit == 0 {
|
|
||||||
cols_that_fit = 1; // Always show at least one column
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut current_y = start_y;
|
|
||||||
let mut col_start = 0;
|
|
||||||
let mut is_continuation = false;
|
|
||||||
|
|
||||||
// Render wrapped sections
|
|
||||||
while col_start < dataset.colnames.len() {
|
|
||||||
let col_end = (col_start + cols_that_fit).min(dataset.colnames.len());
|
|
||||||
let section_colnames = &dataset.colnames[col_start..col_end];
|
|
||||||
let section_widths = &column_widths[col_start..col_end];
|
|
||||||
|
|
||||||
// Render header for this section
|
|
||||||
let mut header_cells = vec![];
|
|
||||||
|
|
||||||
// Status cell
|
|
||||||
if is_continuation {
|
|
||||||
header_cells.push(Cell::from("↳"));
|
|
||||||
} else {
|
|
||||||
header_cells.push(Cell::from(""));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Column headers
|
|
||||||
for colname in section_colnames {
|
|
||||||
header_cells.push(Cell::from(Line::from(vec![Span::styled(
|
|
||||||
colname.clone(),
|
|
||||||
heading_row_style(),
|
|
||||||
)])));
|
|
||||||
}
|
|
||||||
|
|
||||||
let header_row = Row::new(header_cells).style(heading_row_style());
|
|
||||||
|
|
||||||
// Build constraint widths for this section
|
|
||||||
let mut constraints = vec![Constraint::Length(status_col_width)];
|
|
||||||
for &width in section_widths {
|
|
||||||
constraints.push(Constraint::Length(width));
|
|
||||||
}
|
|
||||||
|
|
||||||
let header_table = Table::new(vec![header_row])
|
|
||||||
.widths(&constraints)
|
|
||||||
.column_spacing(col_spacing)
|
|
||||||
.style(neutral_text_style());
|
|
||||||
|
|
||||||
frame.render_widget(header_table, Rect {
|
|
||||||
x: inner.x,
|
|
||||||
y: current_y,
|
|
||||||
width: inner.width,
|
|
||||||
height: 1,
|
|
||||||
});
|
|
||||||
current_y += 1;
|
|
||||||
|
|
||||||
// Render data rows for this section
|
|
||||||
for (row_idx, row) in dataset.rows.iter().enumerate() {
|
|
||||||
if current_y >= inner.y + inner.height {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if this is a sub-service - if so, render as full-width row
|
|
||||||
if row.sub_service.is_some() && col_start == 0 {
|
|
||||||
// Sub-service: render as full-width spanning row
|
|
||||||
let is_last_sub_service = is_last_sub_service_in_group(&dataset.rows, row_idx, &row.sub_service);
|
|
||||||
let tree_char = if is_last_sub_service { "└─" } else { "├─" };
|
|
||||||
let service_name = row.values.get(0).cloned().unwrap_or_default();
|
|
||||||
|
|
||||||
let status_icon = match &row.status {
|
|
||||||
Some(s) => {
|
|
||||||
let color = s.status.to_color();
|
|
||||||
let icon = s.status.to_icon();
|
|
||||||
Span::styled(icon.to_string(), Style::default().fg(color))
|
|
||||||
},
|
|
||||||
None => Span::raw(""),
|
|
||||||
};
|
|
||||||
|
|
||||||
let full_content = format!("{} {}", tree_char, service_name);
|
|
||||||
let full_cell = Cell::from(Line::from(vec![
|
|
||||||
status_icon,
|
|
||||||
Span::raw(" "),
|
|
||||||
Span::styled(full_content, neutral_text_style()),
|
|
||||||
]));
|
|
||||||
|
|
||||||
let full_row = Row::new(vec![full_cell]);
|
|
||||||
let full_constraints = vec![Constraint::Length(inner.width)];
|
|
||||||
let full_table = Table::new(vec![full_row])
|
|
||||||
.widths(&full_constraints)
|
|
||||||
.style(neutral_text_style());
|
|
||||||
|
|
||||||
frame.render_widget(full_table, Rect {
|
|
||||||
x: inner.x,
|
|
||||||
y: current_y,
|
|
||||||
width: inner.width,
|
|
||||||
height: 1,
|
|
||||||
});
|
|
||||||
} else if row.sub_service.is_none() {
|
|
||||||
// Regular service: render with columns as normal
|
|
||||||
let mut cells = vec![];
|
|
||||||
|
|
||||||
// Status cell (only show on first section)
|
|
||||||
if col_start == 0 {
|
|
||||||
match &row.status {
|
|
||||||
Some(s) => {
|
|
||||||
let color = s.status.to_color();
|
|
||||||
let icon = s.status.to_icon();
|
|
||||||
cells.push(Cell::from(Line::from(vec![Span::styled(
|
|
||||||
icon.to_string(),
|
|
||||||
Style::default().fg(color),
|
|
||||||
)])));
|
|
||||||
},
|
|
||||||
None => cells.push(Cell::from("")),
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
cells.push(Cell::from(""));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Data cells for this section
|
|
||||||
for col_idx in col_start..col_end {
|
|
||||||
if let Some(content) = row.values.get(col_idx) {
|
|
||||||
if content.is_empty() {
|
|
||||||
cells.push(Cell::from(""));
|
|
||||||
} else {
|
|
||||||
cells.push(Cell::from(Line::from(vec![Span::styled(
|
|
||||||
content.to_string(),
|
|
||||||
neutral_text_style(),
|
|
||||||
)])));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
cells.push(Cell::from(""));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let data_row = Row::new(cells);
|
|
||||||
let data_table = Table::new(vec![data_row])
|
|
||||||
.widths(&constraints)
|
|
||||||
.column_spacing(col_spacing)
|
|
||||||
.style(neutral_text_style());
|
|
||||||
|
|
||||||
frame.render_widget(data_table, Rect {
|
|
||||||
x: inner.x,
|
|
||||||
y: current_y,
|
|
||||||
width: inner.width,
|
|
||||||
height: 1,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
current_y += 1;
|
|
||||||
|
|
||||||
// Render description rows if any exist
|
|
||||||
for description in &row.description {
|
|
||||||
if current_y >= inner.y + inner.height {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Render description as a single cell spanning the entire width
|
|
||||||
let desc_cell = Cell::from(Line::from(vec![Span::styled(
|
|
||||||
format!(" {}", description),
|
|
||||||
Style::default().fg(Color::Blue),
|
|
||||||
)]));
|
|
||||||
|
|
||||||
let desc_row = Row::new(vec![desc_cell]);
|
|
||||||
let desc_constraints = vec![Constraint::Length(inner.width)];
|
|
||||||
let desc_table = Table::new(vec![desc_row])
|
|
||||||
.widths(&desc_constraints)
|
|
||||||
.style(neutral_text_style());
|
|
||||||
|
|
||||||
frame.render_widget(desc_table, Rect {
|
|
||||||
x: inner.x,
|
|
||||||
y: current_y,
|
|
||||||
width: inner.width,
|
|
||||||
height: 1,
|
|
||||||
});
|
|
||||||
current_y += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
col_start = col_end;
|
|
||||||
is_continuation = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
current_y - start_y
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// Top-level data for a table-style widget: a title, an optional overall
/// status, and the tabular dataset to render.
#[derive(Clone)]
pub struct WidgetData {
    /// Title shown for the widget.
    pub title: String,
    /// Optional aggregated status for the whole widget.
    pub status: Option<WidgetStatus>,
    /// Column headers and rows to display.
    pub dataset: WidgetDataSet,
}
|
|
||||||
|
|
||||||
/// Tabular dataset rendered by a widget: column headers plus rows.
#[derive(Clone)]
pub struct WidgetDataSet {
    /// Column header names.
    pub colnames: Vec<String>,
    /// Optional status for the dataset as a whole.
    pub status: Option<WidgetStatus>,
    /// Data rows, rendered in insertion order.
    pub rows: Vec<WidgetRow>,
}
|
|
||||||
|
|
||||||
/// One table row: an optional status icon, one value per column, optional
/// description lines rendered underneath, and an optional sub-service marker.
#[derive(Clone)]
pub struct WidgetRow {
    /// Optional status shown as a colored icon at the start of the row.
    pub status: Option<WidgetStatus>,
    /// Cell values, indexed per column.
    pub values: Vec<String>,
    /// Extra description lines rendered below the row.
    pub description: Vec<String>,
    /// When set, the row is rendered as an indented sub-service entry with a
    /// tree prefix instead of a normal columned row.
    pub sub_service: Option<String>,
}
|
|
||||||
|
|
||||||
/// Severity level used to pick a status icon and color.
#[derive(Clone, Copy, Debug)]
pub enum StatusLevel {
    /// Everything is healthy.
    Ok,
    /// Degraded but functioning.
    Warning,
    /// Failed or critical.
    Error,
    /// No status information available.
    Unknown,
}
|
|
||||||
|
|
||||||
/// Wrapper around a [`StatusLevel`] attached to widgets, datasets, and rows.
#[derive(Clone)]
pub struct WidgetStatus {
    /// The severity level to display.
    pub status: StatusLevel,
}
|
|
||||||
|
|
||||||
impl WidgetData {
|
|
||||||
pub fn new(title: impl Into<String>, status: Option<WidgetStatus>, colnames: Vec<String>) -> Self {
|
|
||||||
Self {
|
|
||||||
title: title.into(),
|
|
||||||
status: status.clone(),
|
|
||||||
dataset: WidgetDataSet {
|
|
||||||
colnames,
|
|
||||||
status,
|
|
||||||
rows: Vec::new(),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn add_row(&mut self, status: Option<WidgetStatus>, description: Vec<String>, values: Vec<String>) -> &mut Self {
|
|
||||||
self.add_row_with_sub_service(status, description, values, None)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn add_row_with_sub_service(&mut self, status: Option<WidgetStatus>, description: Vec<String>, values: Vec<String>, sub_service: Option<String>) -> &mut Self {
|
|
||||||
self.dataset.rows.push(WidgetRow {
|
|
||||||
status,
|
|
||||||
values,
|
|
||||||
description,
|
|
||||||
sub_service,
|
|
||||||
});
|
|
||||||
self
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl WidgetDataSet {
|
|
||||||
pub fn new(colnames: Vec<String>, status: Option<WidgetStatus>) -> Self {
|
|
||||||
Self {
|
|
||||||
colnames,
|
|
||||||
status,
|
|
||||||
rows: Vec::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn add_row(&mut self, status: Option<WidgetStatus>, description: Vec<String>, values: Vec<String>) -> &mut Self {
|
|
||||||
self.add_row_with_sub_service(status, description, values, None)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn add_row_with_sub_service(&mut self, status: Option<WidgetStatus>, description: Vec<String>, values: Vec<String>, sub_service: Option<String>) -> &mut Self {
|
|
||||||
self.rows.push(WidgetRow {
|
|
||||||
status,
|
|
||||||
values,
|
|
||||||
description,
|
|
||||||
sub_service,
|
|
||||||
});
|
|
||||||
self
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
impl WidgetStatus {
    /// Wrap a [`StatusLevel`] in a `WidgetStatus`.
    pub fn new(status: StatusLevel) -> Self {
        Self { status }
    }
}
|
|
||||||
|
|
||||||
impl StatusLevel {
|
|
||||||
pub fn to_color(self) -> Color {
|
|
||||||
match self {
|
|
||||||
StatusLevel::Ok => Color::Green,
|
|
||||||
StatusLevel::Warning => Color::Yellow,
|
|
||||||
StatusLevel::Error => Color::Red,
|
|
||||||
StatusLevel::Unknown => Color::Reset, // Terminal default
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn to_icon(self) -> &'static str {
|
|
||||||
match self {
|
|
||||||
StatusLevel::Ok => "✔",
|
|
||||||
StatusLevel::Warning => "!",
|
|
||||||
StatusLevel::Error => "✖",
|
|
||||||
StatusLevel::Unknown => "?",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
441
dashboard/src/ui/widgets/backup.rs
Normal file
441
dashboard/src/ui/widgets/backup.rs
Normal file
@@ -0,0 +1,441 @@
|
|||||||
|
use cm_dashboard_shared::{Metric, Status};
|
||||||
|
use ratatui::{
|
||||||
|
layout::Rect,
|
||||||
|
widgets::Paragraph,
|
||||||
|
Frame,
|
||||||
|
};
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use super::Widget;
|
||||||
|
use crate::ui::theme::{StatusIcons, Typography};
|
||||||
|
|
||||||
|
/// Backup widget displaying backup status, services, and repository information.
///
/// All fields are populated by `update_from_metrics` from "backup_*" metrics.
#[derive(Clone)]
pub struct BackupWidget {
    /// Overall backup status (from "backup_overall_status").
    overall_status: Status,
    /// Last backup duration in seconds.
    duration_seconds: Option<i64>,
    /// Last backup unix timestamp.
    last_run_timestamp: Option<i64>,
    /// Total number of backup services.
    total_services: Option<i64>,
    /// Total repository size in GB.
    total_repo_size_gb: Option<f32>,
    /// Total disk space for backups in GB.
    backup_disk_total_gb: Option<f32>,
    /// Used disk space for backups in GB.
    backup_disk_used_gb: Option<f32>,
    /// Backup disk product name from SMART data.
    backup_disk_product_name: Option<String>,
    /// Backup disk serial number from SMART data.
    backup_disk_serial_number: Option<String>,
    /// Backup disk filesystem label.
    backup_disk_filesystem_label: Option<String>,
    /// Number of completed services.
    services_completed_count: Option<i64>,
    /// Number of failed services.
    services_failed_count: Option<i64>,
    /// Number of disabled services.
    services_disabled_count: Option<i64>,
    /// All individual service metrics for detailed display, sorted by name.
    service_metrics: Vec<ServiceMetricData>,
    /// True once at least one meaningful backup metric has been received.
    has_data: bool,
}
|
||||||
|
|
||||||
|
/// Accumulated per-service backup data, built from individual
/// "backup_service_<name>_*" metrics during `update_from_metrics`.
#[derive(Debug, Clone)]
struct ServiceMetricData {
    /// Service name extracted from the metric name.
    name: String,
    /// Status reported by the service's "_status" metric.
    status: Status,
    /// Exit code of the last backup run, if reported.
    exit_code: Option<i64>,
    /// Number of archives in the service's repository, if reported.
    archive_count: Option<i64>,
    /// Repository size in GB, if reported.
    repo_size_gb: Option<f32>,
}
|
||||||
|
|
||||||
|
impl BackupWidget {
|
||||||
|
    /// Create an empty widget with no data; populated later via
    /// `update_from_metrics`.
    pub fn new() -> Self {
        Self {
            overall_status: Status::Unknown,
            duration_seconds: None,
            last_run_timestamp: None,
            total_services: None,
            total_repo_size_gb: None,
            backup_disk_total_gb: None,
            backup_disk_used_gb: None,
            backup_disk_product_name: None,
            backup_disk_serial_number: None,
            backup_disk_filesystem_label: None,
            services_completed_count: None,
            services_failed_count: None,
            services_disabled_count: None,
            service_metrics: Vec::new(),
            has_data: false,
        }
    }
|
||||||
|
|
||||||
|
    /// Check if the backup widget has any data to display.
    pub fn has_data(&self) -> bool {
        self.has_data
    }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Format size with proper units (xxxkB/MB/GB/TB)
|
||||||
|
fn format_size_with_proper_units(size_gb: f32) -> String {
|
||||||
|
if size_gb >= 1000.0 {
|
||||||
|
// TB range
|
||||||
|
format!("{:.1}TB", size_gb / 1000.0)
|
||||||
|
} else if size_gb >= 1.0 {
|
||||||
|
// GB range
|
||||||
|
format!("{:.1}GB", size_gb)
|
||||||
|
} else if size_gb >= 0.001 {
|
||||||
|
// MB range (size_gb * 1024 = MB)
|
||||||
|
let size_mb = size_gb * 1024.0;
|
||||||
|
format!("{:.1}MB", size_mb)
|
||||||
|
} else if size_gb >= 0.000001 {
|
||||||
|
// kB range (size_gb * 1024 * 1024 = kB)
|
||||||
|
let size_kb = size_gb * 1024.0 * 1024.0;
|
||||||
|
format!("{:.0}kB", size_kb)
|
||||||
|
} else {
|
||||||
|
// B range (size_gb * 1024^3 = bytes)
|
||||||
|
let size_bytes = size_gb * 1024.0 * 1024.0 * 1024.0;
|
||||||
|
format!("{:.0}B", size_bytes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Extract service name from metric name (e.g., "backup_service_gitea_status" -> "gitea")
|
||||||
|
fn extract_service_name(metric_name: &str) -> Option<String> {
|
||||||
|
if metric_name.starts_with("backup_service_") {
|
||||||
|
let name_part = &metric_name[15..]; // Remove "backup_service_" prefix
|
||||||
|
|
||||||
|
// Try to extract service name by removing known suffixes
|
||||||
|
if let Some(service_name) = name_part.strip_suffix("_status") {
|
||||||
|
Some(service_name.to_string())
|
||||||
|
} else if let Some(service_name) = name_part.strip_suffix("_exit_code") {
|
||||||
|
Some(service_name.to_string())
|
||||||
|
} else if let Some(service_name) = name_part.strip_suffix("_archive_count") {
|
||||||
|
Some(service_name.to_string())
|
||||||
|
} else if let Some(service_name) = name_part.strip_suffix("_repo_size_gb") {
|
||||||
|
Some(service_name.to_string())
|
||||||
|
} else if let Some(service_name) = name_part.strip_suffix("_repo_path") {
|
||||||
|
Some(service_name.to_string())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Widget for BackupWidget {
    /// Rebuild the widget state from a fresh batch of backup metrics.
    ///
    /// Scalar "backup_*" metrics update the summary fields directly; any
    /// "backup_service_<name>_*" metric is accumulated into a per-service
    /// entry, which is then sorted by name into `service_metrics`.
    fn update_from_metrics(&mut self, metrics: &[&Metric]) {
        debug!("Backup widget updating with {} metrics", metrics.len());
        for metric in metrics {
            debug!(
                "Backup metric: {} = {:?} (status: {:?})",
                metric.name, metric.value, metric.status
            );
        }

        // Also debug the service_data after processing
        debug!("Processing individual service metrics...");

        // Log how many metrics are backup service metrics
        let service_metric_count = metrics
            .iter()
            .filter(|m| m.name.starts_with("backup_service_"))
            .count();
        debug!(
            "Found {} backup_service_ metrics out of {} total backup metrics",
            service_metric_count,
            metrics.len()
        );

        // Reset per-service accumulation; summary fields keep their previous
        // value unless a matching metric arrives in this batch.
        self.service_metrics.clear();
        let mut service_data: std::collections::HashMap<String, ServiceMetricData> =
            std::collections::HashMap::new();

        for metric in metrics {
            match metric.name.as_str() {
                "backup_overall_status" => {
                    let status_str = metric.value.as_string();
                    self.overall_status = match status_str.as_str() {
                        "ok" => Status::Ok,
                        "warning" => Status::Warning,
                        "critical" => Status::Critical,
                        _ => Status::Unknown,
                    };
                }
                "backup_duration_seconds" => {
                    self.duration_seconds = metric.value.as_i64();
                }
                "backup_last_run_timestamp" => {
                    self.last_run_timestamp = metric.value.as_i64();
                }
                "backup_total_services" => {
                    self.total_services = metric.value.as_i64();
                }
                "backup_total_repo_size_gb" => {
                    self.total_repo_size_gb = metric.value.as_f32();
                }
                "backup_disk_total_gb" => {
                    self.backup_disk_total_gb = metric.value.as_f32();
                }
                "backup_disk_used_gb" => {
                    self.backup_disk_used_gb = metric.value.as_f32();
                }
                "backup_disk_product_name" => {
                    self.backup_disk_product_name = Some(metric.value.as_string());
                }
                "backup_disk_serial_number" => {
                    self.backup_disk_serial_number = Some(metric.value.as_string());
                }
                "backup_disk_filesystem_label" => {
                    self.backup_disk_filesystem_label = Some(metric.value.as_string());
                }
                "backup_services_completed_count" => {
                    self.services_completed_count = metric.value.as_i64();
                }
                "backup_services_failed_count" => {
                    self.services_failed_count = metric.value.as_i64();
                }
                "backup_services_disabled_count" => {
                    self.services_disabled_count = metric.value.as_i64();
                }
                _ => {
                    // Handle individual "backup_service_<name>_*" metrics:
                    // accumulate each field into the per-service entry.
                    if let Some(service_name) = Self::extract_service_name(&metric.name) {
                        debug!(
                            "Extracted service name '{}' from metric '{}'",
                            service_name, metric.name
                        );
                        let entry = service_data.entry(service_name.clone()).or_insert_with(|| {
                            ServiceMetricData {
                                name: service_name,
                                status: Status::Unknown,
                                exit_code: None,
                                archive_count: None,
                                repo_size_gb: None,
                            }
                        });

                        if metric.name.ends_with("_status") {
                            entry.status = metric.status;
                            debug!("Set status for {}: {:?}", entry.name, entry.status);
                        } else if metric.name.ends_with("_exit_code") {
                            entry.exit_code = metric.value.as_i64();
                        } else if metric.name.ends_with("_archive_count") {
                            entry.archive_count = metric.value.as_i64();
                            debug!(
                                "Set archive_count for {}: {:?}",
                                entry.name, entry.archive_count
                            );
                        } else if metric.name.ends_with("_repo_size_gb") {
                            entry.repo_size_gb = metric.value.as_f32();
                            debug!(
                                "Set repo_size_gb for {}: {:?}",
                                entry.name, entry.repo_size_gb
                            );
                        }
                    } else {
                        debug!(
                            "Could not extract service name from metric: {}",
                            metric.name
                        );
                    }
                }
            }
        }

        // Convert service data to a vector sorted by service name for a
        // stable display order.
        let mut services: Vec<ServiceMetricData> = service_data.into_values().collect();
        services.sort_by(|a, b| a.name.cmp(&b.name));
        self.service_metrics = services;

        // Only show the backup panel if we have meaningful backup data.
        self.has_data = !metrics.is_empty() && (
            self.last_run_timestamp.is_some() ||
            self.total_repo_size_gb.is_some() ||
            !self.service_metrics.is_empty()
        );

        debug!(
            "Backup widget updated: status={:?}, services={}, total_size={:?}GB",
            self.overall_status,
            self.service_metrics.len(),
            self.total_repo_size_gb
        );

        // Debug individual service data
        for service in &self.service_metrics {
            debug!(
                "Service {}: status={:?}, archives={:?}, size={:?}GB",
                service.name, service.status, service.archive_count, service.repo_size_gb
            );
        }
    }
}
|
||||||
|
|
||||||
|
impl BackupWidget {
|
||||||
|
/// Render with scroll offset support
|
||||||
|
pub fn render_with_scroll(&mut self, frame: &mut Frame, area: Rect, scroll_offset: usize) {
|
||||||
|
let mut lines = Vec::new();
|
||||||
|
|
||||||
|
// Latest backup section
|
||||||
|
lines.push(ratatui::text::Line::from(vec![
|
||||||
|
ratatui::text::Span::styled("Latest backup:", Typography::widget_title())
|
||||||
|
]));
|
||||||
|
|
||||||
|
// Timestamp with status icon
|
||||||
|
let timestamp_text = if let Some(timestamp) = self.last_run_timestamp {
|
||||||
|
self.format_timestamp(timestamp)
|
||||||
|
} else {
|
||||||
|
"Unknown".to_string()
|
||||||
|
};
|
||||||
|
let timestamp_spans = StatusIcons::create_status_spans(
|
||||||
|
self.overall_status,
|
||||||
|
×tamp_text
|
||||||
|
);
|
||||||
|
lines.push(ratatui::text::Line::from(timestamp_spans));
|
||||||
|
|
||||||
|
// Duration as sub-item
|
||||||
|
if let Some(duration) = self.duration_seconds {
|
||||||
|
let duration_text = self.format_duration(duration);
|
||||||
|
lines.push(ratatui::text::Line::from(vec![
|
||||||
|
ratatui::text::Span::styled(" └─ ", Typography::tree()),
|
||||||
|
ratatui::text::Span::styled(format!("Duration: {}", duration_text), Typography::secondary())
|
||||||
|
]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Disk section
|
||||||
|
lines.push(ratatui::text::Line::from(vec![
|
||||||
|
ratatui::text::Span::styled("Disk:", Typography::widget_title())
|
||||||
|
]));
|
||||||
|
|
||||||
|
// Disk product name with status
|
||||||
|
if let Some(product) = &self.backup_disk_product_name {
|
||||||
|
let disk_spans = StatusIcons::create_status_spans(
|
||||||
|
Status::Ok, // Assuming disk is OK if we have data
|
||||||
|
product
|
||||||
|
);
|
||||||
|
lines.push(ratatui::text::Line::from(disk_spans));
|
||||||
|
|
||||||
|
// Serial number as sub-item
|
||||||
|
if let Some(serial) = &self.backup_disk_serial_number {
|
||||||
|
lines.push(ratatui::text::Line::from(vec![
|
||||||
|
ratatui::text::Span::styled(" ├─ ", Typography::tree()),
|
||||||
|
ratatui::text::Span::styled(format!("S/N: {}", serial), Typography::secondary())
|
||||||
|
]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Usage as sub-item
|
||||||
|
if let (Some(used), Some(total)) = (self.backup_disk_used_gb, self.backup_disk_total_gb) {
|
||||||
|
let used_str = Self::format_size_with_proper_units(used);
|
||||||
|
let total_str = Self::format_size_with_proper_units(total);
|
||||||
|
lines.push(ratatui::text::Line::from(vec![
|
||||||
|
ratatui::text::Span::styled(" └─ ", Typography::tree()),
|
||||||
|
ratatui::text::Span::styled(format!("Usage: {}/{}", used_str, total_str), Typography::secondary())
|
||||||
|
]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Repos section
|
||||||
|
lines.push(ratatui::text::Line::from(vec![
|
||||||
|
ratatui::text::Span::styled("Repos:", Typography::widget_title())
|
||||||
|
]));
|
||||||
|
|
||||||
|
// Add all repository lines (no truncation here - scroll will handle display)
|
||||||
|
for service in &self.service_metrics {
|
||||||
|
if let (Some(archives), Some(size_gb)) = (service.archive_count, service.repo_size_gb) {
|
||||||
|
let size_str = Self::format_size_with_proper_units(size_gb);
|
||||||
|
let repo_text = format!("{} ({}) {}", service.name, archives, size_str);
|
||||||
|
let repo_spans = StatusIcons::create_status_spans(service.status, &repo_text);
|
||||||
|
lines.push(ratatui::text::Line::from(repo_spans));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply scroll offset
|
||||||
|
let total_lines = lines.len();
|
||||||
|
let available_height = area.height as usize;
|
||||||
|
|
||||||
|
// Calculate scroll boundaries
|
||||||
|
let max_scroll = if total_lines > available_height {
|
||||||
|
total_lines - available_height
|
||||||
|
} else {
|
||||||
|
total_lines.saturating_sub(1)
|
||||||
|
};
|
||||||
|
let effective_scroll = scroll_offset.min(max_scroll);
|
||||||
|
|
||||||
|
// Apply scrolling if needed
|
||||||
|
if scroll_offset > 0 || total_lines > available_height {
|
||||||
|
let mut visible_lines: Vec<_> = lines
|
||||||
|
.into_iter()
|
||||||
|
.skip(effective_scroll)
|
||||||
|
.take(available_height)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Add scroll indicator if there are hidden lines
|
||||||
|
if total_lines > available_height {
|
||||||
|
let hidden_above = effective_scroll;
|
||||||
|
let hidden_below = total_lines.saturating_sub(effective_scroll + available_height);
|
||||||
|
|
||||||
|
if (hidden_above > 0 || hidden_below > 0) && !visible_lines.is_empty() {
|
||||||
|
let scroll_text = if hidden_above > 0 && hidden_below > 0 {
|
||||||
|
format!("... {} above, {} below", hidden_above, hidden_below)
|
||||||
|
} else if hidden_above > 0 {
|
||||||
|
format!("... {} more above", hidden_above)
|
||||||
|
} else {
|
||||||
|
format!("... {} more below", hidden_below)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Replace last line with scroll indicator
|
||||||
|
visible_lines.pop();
|
||||||
|
visible_lines.push(ratatui::text::Line::from(vec![
|
||||||
|
ratatui::text::Span::styled(scroll_text, Typography::muted())
|
||||||
|
]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let paragraph = Paragraph::new(ratatui::text::Text::from(visible_lines));
|
||||||
|
frame.render_widget(paragraph, area);
|
||||||
|
} else {
|
||||||
|
let paragraph = Paragraph::new(ratatui::text::Text::from(lines));
|
||||||
|
frame.render_widget(paragraph, area);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BackupWidget {
|
||||||
|
/// Format timestamp for display
|
||||||
|
fn format_timestamp(&self, timestamp: i64) -> String {
|
||||||
|
let datetime = chrono::DateTime::from_timestamp(timestamp, 0)
|
||||||
|
.unwrap_or_else(|| chrono::Utc::now());
|
||||||
|
datetime.format("%Y-%m-%d %H:%M:%S").to_string()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Format duration in seconds to human readable format
|
||||||
|
fn format_duration(&self, duration_seconds: i64) -> String {
|
||||||
|
let minutes = duration_seconds / 60;
|
||||||
|
let seconds = duration_seconds % 60;
|
||||||
|
|
||||||
|
if minutes > 0 {
|
||||||
|
format!("{}.{}m", minutes, seconds / 6) // Show 1 decimal for minutes
|
||||||
|
} else {
|
||||||
|
format!("{}s", seconds)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for BackupWidget {
    /// Equivalent to [`BackupWidget::new`].
    fn default() -> Self {
        Self::new()
    }
}
|
||||||
1
dashboard/src/ui/widgets/cpu.rs
Normal file
1
dashboard/src/ui/widgets/cpu.rs
Normal file
@@ -0,0 +1 @@
|
|||||||
|
// This file is intentionally left minimal - CPU functionality is handled by the SystemWidget
|
||||||
1
dashboard/src/ui/widgets/memory.rs
Normal file
1
dashboard/src/ui/widgets/memory.rs
Normal file
@@ -0,0 +1 @@
|
|||||||
|
// This file is intentionally left minimal - Memory functionality is handled by the SystemWidget
|
||||||
18
dashboard/src/ui/widgets/mod.rs
Normal file
18
dashboard/src/ui/widgets/mod.rs
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
use cm_dashboard_shared::Metric;
|
||||||
|
|
||||||
|
pub mod backup;
|
||||||
|
pub mod cpu;
|
||||||
|
pub mod memory;
|
||||||
|
pub mod services;
|
||||||
|
pub mod system;
|
||||||
|
|
||||||
|
pub use backup::BackupWidget;
|
||||||
|
pub use services::ServicesWidget;
|
||||||
|
pub use system::SystemWidget;
|
||||||
|
|
||||||
|
/// Widget trait for UI components that display metrics.
pub trait Widget {
    /// Update the widget's internal state from a new batch of metrics.
    fn update_from_metrics(&mut self, metrics: &[&Metric]);
}
|
||||||
638
dashboard/src/ui/widgets/services.rs
Normal file
638
dashboard/src/ui/widgets/services.rs
Normal file
@@ -0,0 +1,638 @@
|
|||||||
|
use cm_dashboard_shared::{Metric, Status};
|
||||||
|
use ratatui::{
|
||||||
|
layout::{Constraint, Direction, Layout, Rect},
|
||||||
|
widgets::Paragraph,
|
||||||
|
Frame,
|
||||||
|
};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use super::Widget;
|
||||||
|
use crate::ui::theme::{Components, StatusIcons, Theme, Typography};
|
||||||
|
use crate::ui::CommandType;
|
||||||
|
use ratatui::style::Style;
|
||||||
|
|
||||||
|
/// Services widget displaying hierarchical systemd service statuses.
#[derive(Clone)]
pub struct ServicesWidget {
    /// Parent services (nginx, docker, etc.), keyed by service name.
    parent_services: HashMap<String, ServiceInfo>,
    /// Sub-services grouped by parent (nginx -> [gitea, mariehall, ...],
    /// docker -> [container1, ...]).
    sub_services: HashMap<String, Vec<(String, ServiceInfo)>>,
    /// Aggregated status across all services.
    status: Status,
    /// True once metrics have been received.
    has_data: bool,
    /// Currently selected service index (navigation cursor).
    selected_index: usize,
}
|
||||||
|
|
||||||
|
/// Per-service display data parsed from "service_*" metrics.
#[derive(Clone)]
struct ServiceInfo {
    /// Raw status string from the agent (e.g. "active", "failed").
    status: String,
    /// Resident memory in MB, if reported.
    memory_mb: Option<f32>,
    /// Disk usage in GB, if reported.
    disk_gb: Option<f32>,
    /// Endpoint latency in ms, if reported; negative values mean timeout.
    latency_ms: Option<f32>,
    /// Normalized status level used for icon/color selection.
    widget_status: Status,
}
|
||||||
|
|
||||||
|
impl ServicesWidget {
|
||||||
|
    /// Create an empty widget; populated later via metric updates.
    pub fn new() -> Self {
        Self {
            parent_services: HashMap::new(),
            sub_services: HashMap::new(),
            status: Status::Unknown,
            has_data: false,
            selected_index: 0,
        }
    }
|
||||||
|
|
||||||
|
/// Extract service name and determine if it's a parent or sub-service
|
||||||
|
fn extract_service_info(metric_name: &str) -> Option<(String, Option<String>)> {
|
||||||
|
if metric_name.starts_with("service_") {
|
||||||
|
if let Some(end_pos) = metric_name
|
||||||
|
.rfind("_status")
|
||||||
|
.or_else(|| metric_name.rfind("_memory_mb"))
|
||||||
|
.or_else(|| metric_name.rfind("_disk_gb"))
|
||||||
|
.or_else(|| metric_name.rfind("_latency_ms"))
|
||||||
|
{
|
||||||
|
let service_part = &metric_name[8..end_pos]; // Remove "service_" prefix
|
||||||
|
|
||||||
|
// Check for sub-services patterns
|
||||||
|
if service_part.starts_with("nginx_") {
|
||||||
|
// nginx sub-services: service_nginx_gitea_latency_ms -> ("nginx", "gitea")
|
||||||
|
let sub_service = service_part.strip_prefix("nginx_").unwrap_or(service_part);
|
||||||
|
return Some(("nginx".to_string(), Some(sub_service.to_string())));
|
||||||
|
} else if service_part.starts_with("docker_") {
|
||||||
|
// docker sub-services: service_docker_container1_status -> ("docker", "container1")
|
||||||
|
let sub_service = service_part.strip_prefix("docker_").unwrap_or(service_part);
|
||||||
|
return Some(("docker".to_string(), Some(sub_service.to_string())));
|
||||||
|
} else {
|
||||||
|
// Regular parent service: service_nginx_status -> ("nginx", None)
|
||||||
|
return Some((service_part.to_string(), None));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Format disk size with appropriate units (kB/MB/GB)
|
||||||
|
fn format_disk_size(size_gb: f32) -> String {
|
||||||
|
let size_mb = size_gb * 1024.0; // Convert GB to MB
|
||||||
|
|
||||||
|
if size_mb >= 1024.0 {
|
||||||
|
// Show as GB
|
||||||
|
format!("{:.1}GB", size_gb)
|
||||||
|
} else if size_mb >= 1.0 {
|
||||||
|
// Show as MB
|
||||||
|
format!("{:.0}MB", size_mb)
|
||||||
|
} else if size_mb >= 0.001 {
|
||||||
|
// Convert to kB
|
||||||
|
let size_kb = size_mb * 1024.0;
|
||||||
|
format!("{:.0}kB", size_kb)
|
||||||
|
} else {
|
||||||
|
// Show very small sizes as bytes
|
||||||
|
let size_bytes = size_mb * 1024.0 * 1024.0;
|
||||||
|
format!("{:.0}B", size_bytes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Format parent service line - returns text without icon for span formatting
|
||||||
|
fn format_parent_service_line(&self, name: &str, info: &ServiceInfo) -> String {
|
||||||
|
let memory_str = info
|
||||||
|
.memory_mb
|
||||||
|
.map_or("0M".to_string(), |m| format!("{:.0}M", m));
|
||||||
|
let disk_str = info
|
||||||
|
.disk_gb
|
||||||
|
.map_or("0".to_string(), |d| Self::format_disk_size(d));
|
||||||
|
|
||||||
|
// Truncate long service names to fit layout (account for icon space)
|
||||||
|
let short_name = if name.len() > 22 {
|
||||||
|
format!("{}...", &name[..19])
|
||||||
|
} else {
|
||||||
|
name.to_string()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Parent services always show actual systemctl status
|
||||||
|
let status_str = match info.widget_status {
|
||||||
|
Status::Pending => "pending".to_string(),
|
||||||
|
_ => info.status.clone(), // Use actual status from agent (active/inactive/failed)
|
||||||
|
};
|
||||||
|
|
||||||
|
format!(
|
||||||
|
"{:<23} {:<10} {:<8} {:<8}",
|
||||||
|
short_name, status_str, memory_str, disk_str
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get status icon for service, considering pending transitions for visual feedback
|
||||||
|
fn get_service_icon_and_status(&self, service_name: &str, info: &ServiceInfo, pending_transitions: &HashMap<String, (CommandType, String, std::time::Instant)>) -> (String, String, ratatui::prelude::Color) {
|
||||||
|
// Check if this service has a pending transition
|
||||||
|
if let Some((command_type, _original_status, _start_time)) = pending_transitions.get(service_name) {
|
||||||
|
// Show transitional icons for pending commands
|
||||||
|
let (icon, status_text) = match command_type {
|
||||||
|
CommandType::ServiceStart => ("↑", "starting"),
|
||||||
|
CommandType::ServiceStop => ("↓", "stopping"),
|
||||||
|
_ => return (StatusIcons::get_icon(info.widget_status).to_string(), info.status.clone(), Theme::status_color(info.widget_status)), // Not a service command
|
||||||
|
};
|
||||||
|
return (icon.to_string(), status_text.to_string(), Theme::highlight());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Normal status display
|
||||||
|
let icon = StatusIcons::get_icon(info.widget_status);
|
||||||
|
let status_color = match info.widget_status {
|
||||||
|
Status::Ok => Theme::success(),
|
||||||
|
Status::Pending => Theme::highlight(),
|
||||||
|
Status::Warning => Theme::warning(),
|
||||||
|
Status::Critical => Theme::error(),
|
||||||
|
Status::Unknown => Theme::muted_text(),
|
||||||
|
};
|
||||||
|
|
||||||
|
(icon.to_string(), info.status.clone(), status_color)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Create spans for sub-service with icon next to name, considering pending transitions
|
||||||
|
fn create_sub_service_spans_with_transitions(
|
||||||
|
&self,
|
||||||
|
name: &str,
|
||||||
|
info: &ServiceInfo,
|
||||||
|
is_last: bool,
|
||||||
|
pending_transitions: &HashMap<String, (CommandType, String, std::time::Instant)>,
|
||||||
|
) -> Vec<ratatui::text::Span<'static>> {
|
||||||
|
// Truncate long sub-service names to fit layout (accounting for indentation)
|
||||||
|
let short_name = if name.len() > 18 {
|
||||||
|
format!("{}...", &name[..15])
|
||||||
|
} else {
|
||||||
|
name.to_string()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Get status icon and text, considering pending transitions
|
||||||
|
let (icon, mut status_str, status_color) = self.get_service_icon_and_status(name, info, pending_transitions);
|
||||||
|
|
||||||
|
// For sub-services, prefer latency if available (unless transition is pending)
|
||||||
|
if !pending_transitions.contains_key(name) {
|
||||||
|
if let Some(latency) = info.latency_ms {
|
||||||
|
status_str = if latency < 0.0 {
|
||||||
|
"timeout".to_string()
|
||||||
|
} else {
|
||||||
|
format!("{:.0}ms", latency)
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let tree_symbol = if is_last { "└─" } else { "├─" };
|
||||||
|
|
||||||
|
vec![
|
||||||
|
// Indentation and tree prefix
|
||||||
|
ratatui::text::Span::styled(
|
||||||
|
format!(" {} ", tree_symbol),
|
||||||
|
Typography::tree(),
|
||||||
|
),
|
||||||
|
// Status icon
|
||||||
|
ratatui::text::Span::styled(
|
||||||
|
format!("{} ", icon),
|
||||||
|
Style::default().fg(status_color).bg(Theme::background()),
|
||||||
|
),
|
||||||
|
// Service name
|
||||||
|
ratatui::text::Span::styled(
|
||||||
|
format!("{:<18} ", short_name),
|
||||||
|
Style::default()
|
||||||
|
.fg(Theme::secondary_text())
|
||||||
|
.bg(Theme::background()),
|
||||||
|
),
|
||||||
|
// Status/latency text
|
||||||
|
ratatui::text::Span::styled(
|
||||||
|
status_str,
|
||||||
|
Style::default()
|
||||||
|
.fg(Theme::secondary_text())
|
||||||
|
.bg(Theme::background()),
|
||||||
|
),
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Move selection up
|
||||||
|
pub fn select_previous(&mut self) {
|
||||||
|
if self.selected_index > 0 {
|
||||||
|
self.selected_index -= 1;
|
||||||
|
}
|
||||||
|
debug!("Service selection moved up to: {}", self.selected_index);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Move selection down
|
||||||
|
pub fn select_next(&mut self, total_services: usize) {
|
||||||
|
if total_services > 0 && self.selected_index < total_services.saturating_sub(1) {
|
||||||
|
self.selected_index += 1;
|
||||||
|
}
|
||||||
|
debug!("Service selection: {}/{}", self.selected_index, total_services);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get currently selected service name (for actions)
|
||||||
|
pub fn get_selected_service(&self) -> Option<String> {
|
||||||
|
// Build the same display list to find the selected service
|
||||||
|
let mut display_lines: Vec<(String, Status, bool, Option<(ServiceInfo, bool)>, String)> = Vec::new();
|
||||||
|
|
||||||
|
let mut parent_services: Vec<_> = self.parent_services.iter().collect();
|
||||||
|
parent_services.sort_by(|(a, _), (b, _)| a.cmp(b));
|
||||||
|
|
||||||
|
for (parent_name, parent_info) in parent_services {
|
||||||
|
let parent_line = self.format_parent_service_line(parent_name, parent_info);
|
||||||
|
display_lines.push((parent_line, parent_info.widget_status, false, None, parent_name.clone()));
|
||||||
|
|
||||||
|
if let Some(sub_list) = self.sub_services.get(parent_name) {
|
||||||
|
let mut sorted_subs = sub_list.clone();
|
||||||
|
sorted_subs.sort_by(|(a, _), (b, _)| a.cmp(b));
|
||||||
|
|
||||||
|
for (i, (sub_name, sub_info)) in sorted_subs.iter().enumerate() {
|
||||||
|
let is_last_sub = i == sorted_subs.len() - 1;
|
||||||
|
let full_sub_name = format!("{}_{}", parent_name, sub_name);
|
||||||
|
display_lines.push((
|
||||||
|
sub_name.clone(),
|
||||||
|
sub_info.widget_status,
|
||||||
|
true,
|
||||||
|
Some((sub_info.clone(), is_last_sub)),
|
||||||
|
full_sub_name,
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
display_lines.get(self.selected_index).map(|(_, _, _, _, raw_name)| raw_name.clone())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get total count of selectable services (parent services only, not sub-services)
///
/// This value bounds `select_next` and the post-update selection clamp, so it
/// must agree with what the selection logic treats as selectable.
pub fn get_total_services_count(&self) -> usize {
    // Only count parent services - sub-services are not selectable
    self.parent_services.len()
}
|
||||||
|
|
||||||
|
/// Get current status of a specific service by name
|
||||||
|
pub fn get_service_status(&self, service_name: &str) -> Option<String> {
|
||||||
|
// Check if it's a parent service
|
||||||
|
if let Some(parent_info) = self.parent_services.get(service_name) {
|
||||||
|
return Some(parent_info.status.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check sub-services (format: parent_sub)
|
||||||
|
for (parent_name, sub_list) in &self.sub_services {
|
||||||
|
for (sub_name, sub_info) in sub_list {
|
||||||
|
let full_sub_name = format!("{}_{}", parent_name, sub_name);
|
||||||
|
if full_sub_name == service_name {
|
||||||
|
return Some(sub_info.status.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate which parent service index corresponds to a display line index
|
||||||
|
fn calculate_parent_service_index(&self, display_line_index: &usize) -> usize {
|
||||||
|
// Build the same display list to map line index to parent service index
|
||||||
|
let mut parent_index = 0;
|
||||||
|
let mut line_index = 0;
|
||||||
|
|
||||||
|
let mut parent_services: Vec<_> = self.parent_services.iter().collect();
|
||||||
|
parent_services.sort_by(|(a, _), (b, _)| a.cmp(b));
|
||||||
|
|
||||||
|
for (parent_name, _) in parent_services {
|
||||||
|
if line_index == *display_line_index {
|
||||||
|
return parent_index;
|
||||||
|
}
|
||||||
|
line_index += 1; // Parent service line
|
||||||
|
|
||||||
|
// Skip sub-services but count them in line_index
|
||||||
|
if let Some(sub_list) = self.sub_services.get(parent_name) {
|
||||||
|
line_index += sub_list.len();
|
||||||
|
}
|
||||||
|
|
||||||
|
parent_index += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we get here, the display_line_index was probably for a sub-service
|
||||||
|
// Return the last valid parent index (should not happen with our logic)
|
||||||
|
parent_index.saturating_sub(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Widget for ServicesWidget {
|
||||||
|
fn update_from_metrics(&mut self, metrics: &[&Metric]) {
|
||||||
|
debug!("Services widget updating with {} metrics", metrics.len());
|
||||||
|
|
||||||
|
// Don't clear existing services - preserve data between metric batches
|
||||||
|
|
||||||
|
// Process individual service metrics
|
||||||
|
for metric in metrics {
|
||||||
|
if let Some((parent_service, sub_service)) = Self::extract_service_info(&metric.name) {
|
||||||
|
match sub_service {
|
||||||
|
None => {
|
||||||
|
// Parent service metric
|
||||||
|
let service_info =
|
||||||
|
self.parent_services
|
||||||
|
.entry(parent_service)
|
||||||
|
.or_insert(ServiceInfo {
|
||||||
|
status: "unknown".to_string(),
|
||||||
|
memory_mb: None,
|
||||||
|
disk_gb: None,
|
||||||
|
latency_ms: None,
|
||||||
|
widget_status: Status::Unknown,
|
||||||
|
});
|
||||||
|
|
||||||
|
if metric.name.ends_with("_status") {
|
||||||
|
service_info.status = metric.value.as_string();
|
||||||
|
service_info.widget_status = metric.status;
|
||||||
|
} else if metric.name.ends_with("_memory_mb") {
|
||||||
|
if let Some(memory) = metric.value.as_f32() {
|
||||||
|
service_info.memory_mb = Some(memory);
|
||||||
|
}
|
||||||
|
} else if metric.name.ends_with("_disk_gb") {
|
||||||
|
if let Some(disk) = metric.value.as_f32() {
|
||||||
|
service_info.disk_gb = Some(disk);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(sub_name) => {
|
||||||
|
// Sub-service metric
|
||||||
|
let sub_service_list = self
|
||||||
|
.sub_services
|
||||||
|
.entry(parent_service)
|
||||||
|
.or_insert_with(Vec::new);
|
||||||
|
|
||||||
|
// Find existing sub-service or create new one
|
||||||
|
let sub_service_info = if let Some(pos) = sub_service_list
|
||||||
|
.iter()
|
||||||
|
.position(|(name, _)| name == &sub_name)
|
||||||
|
{
|
||||||
|
&mut sub_service_list[pos].1
|
||||||
|
} else {
|
||||||
|
sub_service_list.push((
|
||||||
|
sub_name.clone(),
|
||||||
|
ServiceInfo {
|
||||||
|
status: "unknown".to_string(),
|
||||||
|
memory_mb: None,
|
||||||
|
disk_gb: None,
|
||||||
|
latency_ms: None,
|
||||||
|
widget_status: Status::Unknown,
|
||||||
|
},
|
||||||
|
));
|
||||||
|
&mut sub_service_list.last_mut().unwrap().1
|
||||||
|
};
|
||||||
|
|
||||||
|
if metric.name.ends_with("_status") {
|
||||||
|
sub_service_info.status = metric.value.as_string();
|
||||||
|
sub_service_info.widget_status = metric.status;
|
||||||
|
} else if metric.name.ends_with("_memory_mb") {
|
||||||
|
if let Some(memory) = metric.value.as_f32() {
|
||||||
|
sub_service_info.memory_mb = Some(memory);
|
||||||
|
}
|
||||||
|
} else if metric.name.ends_with("_disk_gb") {
|
||||||
|
if let Some(disk) = metric.value.as_f32() {
|
||||||
|
sub_service_info.disk_gb = Some(disk);
|
||||||
|
}
|
||||||
|
} else if metric.name.ends_with("_latency_ms") {
|
||||||
|
if let Some(latency) = metric.value.as_f32() {
|
||||||
|
sub_service_info.latency_ms = Some(latency);
|
||||||
|
sub_service_info.widget_status = metric.status;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Aggregate status from all parent and sub-services
|
||||||
|
let mut all_statuses = Vec::new();
|
||||||
|
|
||||||
|
// Add parent service statuses
|
||||||
|
all_statuses.extend(self.parent_services.values().map(|info| info.widget_status));
|
||||||
|
|
||||||
|
// Add sub-service statuses
|
||||||
|
for sub_list in self.sub_services.values() {
|
||||||
|
all_statuses.extend(sub_list.iter().map(|(_, info)| info.widget_status));
|
||||||
|
}
|
||||||
|
|
||||||
|
self.status = if all_statuses.is_empty() {
|
||||||
|
Status::Unknown
|
||||||
|
} else {
|
||||||
|
Status::aggregate(&all_statuses)
|
||||||
|
};
|
||||||
|
|
||||||
|
self.has_data = !self.parent_services.is_empty() || !self.sub_services.is_empty();
|
||||||
|
|
||||||
|
// Ensure selection index is within bounds after update
|
||||||
|
let total_count = self.get_total_services_count();
|
||||||
|
if self.selected_index >= total_count && total_count > 0 {
|
||||||
|
self.selected_index = total_count - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
"Services widget updated: {} parent services, {} sub-service groups, total={}, selected={}, status={:?}",
|
||||||
|
self.parent_services.len(),
|
||||||
|
self.sub_services.len(),
|
||||||
|
total_count,
|
||||||
|
self.selected_index,
|
||||||
|
self.status
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ServicesWidget {

    /// Render with focus, scroll, and pending transitions for visual feedback
    ///
    /// Draws the bordered "services" block, a one-line column header, and then
    /// delegates the scrollable service list to
    /// `render_services_with_transitions`.
    pub fn render_with_transitions(&mut self, frame: &mut Frame, area: Rect, is_focused: bool, scroll_offset: usize, pending_transitions: &HashMap<String, (CommandType, String, std::time::Instant)>) {
        // Outer bordered block; all content renders inside its inner area.
        let services_block = Components::widget_block("services");
        let inner_area = services_block.inner(area);
        frame.render_widget(services_block, area);

        // Row 0: fixed header. Remainder: the service list.
        let content_chunks = Layout::default()
            .direction(Direction::Vertical)
            .constraints([Constraint::Length(1), Constraint::Min(0)])
            .split(inner_area);

        // Header
        let header = format!(
            "{:<25} {:<10} {:<8} {:<8}",
            "Service:", "Status:", "RAM:", "Disk:"
        );
        let header_para = Paragraph::new(header).style(Typography::muted());
        frame.render_widget(header_para, content_chunks[0]);

        // Check if we have any services to display
        if self.parent_services.is_empty() && self.sub_services.is_empty() {
            let empty_text = Paragraph::new("No process data").style(Typography::muted());
            frame.render_widget(empty_text, content_chunks[1]);
            return;
        }

        // Use the existing render logic but with pending transitions
        self.render_services_with_transitions(frame, content_chunks[1], is_focused, scroll_offset, pending_transitions);
    }

    /// Render services list with pending transitions awareness
    ///
    /// Builds a flat display list of (rendered text, status, is_sub_service,
    /// optional sub-service payload, raw service name) tuples — one per row —
    /// then applies the scroll offset, renders each visible row, and finally
    /// draws a "... N more" indicator when rows are clipped.
    fn render_services_with_transitions(&mut self, frame: &mut Frame, area: Rect, is_focused: bool, scroll_offset: usize, pending_transitions: &HashMap<String, (CommandType, String, std::time::Instant)>) {
        // Build hierarchical service list for display - include raw service name for pending transition lookups
        let mut display_lines: Vec<(String, Status, bool, Option<(ServiceInfo, bool)>, String)> = Vec::new(); // Added raw service name

        // Sort parent services alphabetically for consistent order
        let mut parent_services: Vec<_> = self.parent_services.iter().collect();
        parent_services.sort_by(|(a, _), (b, _)| a.cmp(b));

        for (parent_name, parent_info) in parent_services {
            // Add parent service line
            let parent_line = self.format_parent_service_line(parent_name, parent_info);
            display_lines.push((parent_line, parent_info.widget_status, false, None, parent_name.clone())); // Include raw name

            // Add sub-services for this parent (if any)
            if let Some(sub_list) = self.sub_services.get(parent_name) {
                // Sort sub-services by name for consistent display
                let mut sorted_subs = sub_list.clone();
                sorted_subs.sort_by(|(a, _), (b, _)| a.cmp(b));

                for (i, (sub_name, sub_info)) in sorted_subs.iter().enumerate() {
                    let is_last_sub = i == sorted_subs.len() - 1;
                    let full_sub_name = format!("{}_{}", parent_name, sub_name);
                    // Store sub-service info for custom span rendering
                    display_lines.push((
                        sub_name.clone(),
                        sub_info.widget_status,
                        true,
                        Some((sub_info.clone(), is_last_sub)),
                        full_sub_name, // Raw service name for pending transition lookup
                    )); // true = sub-service, with is_last info
                }
            }
        }

        // Apply scroll offset and render visible lines (same as existing logic)
        let available_lines = area.height as usize;
        let total_lines = display_lines.len();

        // Calculate scroll boundaries
        // NOTE(review): when everything fits (total <= available), max_scroll is
        // total_lines - 1 rather than 0, so scrolling can still blank out rows —
        // looks intentional to allow cursor-follow scrolling, but worth confirming.
        let max_scroll = if total_lines > available_lines {
            total_lines - available_lines
        } else {
            total_lines.saturating_sub(1)
        };
        let effective_scroll = scroll_offset.min(max_scroll);

        // Get visible lines after scrolling
        let visible_lines: Vec<_> = display_lines
            .iter()
            .skip(effective_scroll)
            .take(available_lines)
            .collect();

        let lines_to_show = visible_lines.len();

        if lines_to_show > 0 {
            // One fixed-height row per visible line.
            let service_chunks = Layout::default()
                .direction(Direction::Vertical)
                .constraints(vec![Constraint::Length(1); lines_to_show])
                .split(area);

            for (i, (line_text, line_status, is_sub, sub_info, raw_service_name)) in visible_lines.iter().enumerate()
            {
                let actual_index = effective_scroll + i; // Real index in the full list

                // Only parent services can be selected - calculate parent service index
                let is_selected = if !*is_sub {
                    // This is a parent service - count how many parent services came before this one
                    let parent_index = self.calculate_parent_service_index(&actual_index);
                    parent_index == self.selected_index
                } else {
                    false // Sub-services are never selected
                };

                let mut spans = if *is_sub && sub_info.is_some() {
                    // Use custom sub-service span creation WITH pending transitions
                    let (service_info, is_last) = sub_info.as_ref().unwrap();
                    self.create_sub_service_spans_with_transitions(line_text, service_info, *is_last, pending_transitions)
                } else {
                    // Parent services - check if this parent service has a pending transition using RAW service name
                    if pending_transitions.contains_key(raw_service_name) {
                        // Create spans with transitional status. A throwaway
                        // ServiceInfo is passed only to satisfy the signature;
                        // the pending-transition branch ignores its fields.
                        let (icon, status_text, _) = self.get_service_icon_and_status(raw_service_name, &ServiceInfo {
                            status: "".to_string(),
                            memory_mb: None,
                            disk_gb: None,
                            latency_ms: None,
                            widget_status: *line_status
                        }, pending_transitions);

                        // Use blue for transitional icons when not selected, background color when selected
                        let icon_color = if is_selected && !*is_sub && is_focused {
                            Theme::background() // Dark background color for visibility against blue selection
                        } else {
                            Theme::highlight() // Blue for normal case
                        };

                        vec![
                            ratatui::text::Span::styled(format!("{} ", icon), Style::default().fg(icon_color)),
                            ratatui::text::Span::styled(line_text.clone(), Style::default().fg(Theme::primary_text())),
                            ratatui::text::Span::styled(format!(" {}", status_text), Style::default().fg(icon_color)),
                        ]
                    } else {
                        StatusIcons::create_status_spans(*line_status, line_text)
                    }
                };

                // Apply selection highlighting to parent services only, making icons background color when selected
                // Only show selection when Services panel is focused
                // Show selection highlighting even when transitional icons are present
                if is_selected && !*is_sub && is_focused {
                    // NOTE(review): both branches below apply the identical style;
                    // the i == 0 split is redundant and could collapse to one arm.
                    for (i, span) in spans.iter_mut().enumerate() {
                        if i == 0 {
                            // First span is the status icon - use background color for visibility against blue selection
                            span.style = span.style
                                .bg(Theme::highlight())
                                .fg(Theme::background());
                        } else {
                            // Other spans (text) get full selection highlighting
                            span.style = span.style
                                .bg(Theme::highlight())
                                .fg(Theme::background());
                        }
                    }
                }

                let service_para = Paragraph::new(ratatui::text::Line::from(spans));

                frame.render_widget(service_para, service_chunks[i]);
            }
        }

        // Show scroll indicator if there are more services than we can display (same as existing)
        if total_lines > available_lines {
            let hidden_above = effective_scroll;
            let hidden_below = total_lines.saturating_sub(effective_scroll + available_lines);

            if hidden_above > 0 || hidden_below > 0 {
                let scroll_text = if hidden_above > 0 && hidden_below > 0 {
                    format!("... {} above, {} below", hidden_above, hidden_below)
                } else if hidden_above > 0 {
                    format!("... {} more above", hidden_above)
                } else {
                    format!("... {} more below", hidden_below)
                };

                if available_lines > 0 && lines_to_show > 0 {
                    // Overwrite the last visible row with the indicator.
                    let last_line_area = Rect {
                        x: area.x,
                        y: area.y + (lines_to_show - 1) as u16,
                        width: area.width,
                        height: 1,
                    };

                    let scroll_para = Paragraph::new(scroll_text).style(Typography::muted());
                    frame.render_widget(scroll_para, last_line_area);
                }
            }
        }
    }
}
|
||||||
|
|
||||||
|
impl Default for ServicesWidget {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
577
dashboard/src/ui/widgets/system.rs
Normal file
577
dashboard/src/ui/widgets/system.rs
Normal file
@@ -0,0 +1,577 @@
|
|||||||
|
use cm_dashboard_shared::{Metric, MetricValue, Status};
|
||||||
|
use ratatui::{
|
||||||
|
layout::Rect,
|
||||||
|
text::{Line, Span, Text},
|
||||||
|
widgets::Paragraph,
|
||||||
|
Frame,
|
||||||
|
};
|
||||||
|
|
||||||
|
use super::Widget;
|
||||||
|
use crate::ui::theme::{StatusIcons, Typography};
|
||||||
|
|
||||||
|
/// System widget displaying NixOS info, CPU, RAM, and Storage in unified layout
///
/// Every metric field is `Option` because data arrives incrementally from the
/// metric stream; `None` renders as an em-dash placeholder in the formatters.
#[derive(Clone)]
pub struct SystemWidget {
    // NixOS information
    nixos_build: Option<String>,    // from "system_nixos_build"
    config_hash: Option<String>,    // from "system_config_hash"
    agent_hash: Option<String>,     // from "agent_version"; used for rebuild completion detection

    // CPU metrics
    cpu_load_1min: Option<f32>,
    cpu_load_5min: Option<f32>,
    cpu_load_15min: Option<f32>,
    cpu_frequency: Option<f32>,     // MHz
    cpu_status: Status,

    // Memory metrics
    memory_usage_percent: Option<f32>,
    memory_used_gb: Option<f32>,
    memory_total_gb: Option<f32>,
    tmp_usage_percent: Option<f32>, // /tmp filesystem
    tmp_used_gb: Option<f32>,
    tmp_total_gb: Option<f32>,
    memory_status: Status,
    tmp_status: Status,

    // Storage metrics (collected from disk metrics)
    storage_pools: Vec<StoragePool>,

    // Overall status
    has_data: bool,                 // set once any metric batch has been received
}
|
||||||
|
|
||||||
|
/// One storage pool (a mounted filesystem) plus the physical drives behind it.
#[derive(Clone)]
struct StoragePool {
    name: String,          // pool name parsed from "disk_{pool}_..." metric names
    mount_point: String,   // derived via get_mount_point_for_pool
    pool_type: String, // "Single", "Raid0", etc.
    drives: Vec<StorageDrive>,
    usage_percent: Option<f32>,
    used_gb: Option<f32>,
    total_gb: Option<f32>,
    status: Status,        // taken from the pool's usage_percent metric
}
|
||||||
|
|
||||||
|
/// Per-drive health data displayed under a storage pool.
#[derive(Clone)]
struct StorageDrive {
    name: String,              // device name parsed from the metric name
    temperature: Option<f32>,  // rendered as "T: {n}C" — degrees Celsius
    wear_percent: Option<f32>, // rendered as "W: {n}%"
    status: Status,
}
|
||||||
|
|
||||||
|
impl SystemWidget {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
nixos_build: None,
|
||||||
|
config_hash: None,
|
||||||
|
agent_hash: None,
|
||||||
|
cpu_load_1min: None,
|
||||||
|
cpu_load_5min: None,
|
||||||
|
cpu_load_15min: None,
|
||||||
|
cpu_frequency: None,
|
||||||
|
cpu_status: Status::Unknown,
|
||||||
|
memory_usage_percent: None,
|
||||||
|
memory_used_gb: None,
|
||||||
|
memory_total_gb: None,
|
||||||
|
tmp_usage_percent: None,
|
||||||
|
tmp_used_gb: None,
|
||||||
|
tmp_total_gb: None,
|
||||||
|
memory_status: Status::Unknown,
|
||||||
|
tmp_status: Status::Unknown,
|
||||||
|
storage_pools: Vec::new(),
|
||||||
|
has_data: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Format CPU load averages
|
||||||
|
fn format_cpu_load(&self) -> String {
|
||||||
|
match (self.cpu_load_1min, self.cpu_load_5min, self.cpu_load_15min) {
|
||||||
|
(Some(l1), Some(l5), Some(l15)) => {
|
||||||
|
format!("{:.2} {:.2} {:.2}", l1, l5, l15)
|
||||||
|
}
|
||||||
|
_ => "— — —".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Format CPU frequency
|
||||||
|
fn format_cpu_frequency(&self) -> String {
|
||||||
|
match self.cpu_frequency {
|
||||||
|
Some(freq) => format!("{:.0} MHz", freq),
|
||||||
|
None => "— MHz".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Format memory usage
|
||||||
|
fn format_memory_usage(&self) -> String {
|
||||||
|
match (self.memory_usage_percent, self.memory_used_gb, self.memory_total_gb) {
|
||||||
|
(Some(pct), Some(used), Some(total)) => {
|
||||||
|
format!("{:.0}% {:.1}GB/{:.1}GB", pct, used, total)
|
||||||
|
}
|
||||||
|
_ => "—% —GB/—GB".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Format /tmp usage
|
||||||
|
fn format_tmp_usage(&self) -> String {
|
||||||
|
match (self.tmp_usage_percent, self.tmp_used_gb, self.tmp_total_gb) {
|
||||||
|
(Some(pct), Some(used), Some(total)) => {
|
||||||
|
let used_str = if used < 0.1 {
|
||||||
|
format!("{:.0}B", used * 1024.0) // Show as MB if very small
|
||||||
|
} else {
|
||||||
|
format!("{:.1}GB", used)
|
||||||
|
};
|
||||||
|
format!("{:.0}% {}/{:.1}GB", pct, used_str, total)
|
||||||
|
}
|
||||||
|
_ => "—% —GB/—GB".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the current agent hash for rebuild completion detection
|
||||||
|
pub fn _get_agent_hash(&self) -> Option<&String> {
|
||||||
|
self.agent_hash.as_ref()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get mount point for a pool name
|
||||||
|
fn get_mount_point_for_pool(&self, pool_name: &str) -> String {
|
||||||
|
match pool_name {
|
||||||
|
"root" => "/".to_string(),
|
||||||
|
"steampool" => "/mnt/steampool".to_string(),
|
||||||
|
"steampool_1" => "/steampool_1".to_string(),
|
||||||
|
"steampool_2" => "/steampool_2".to_string(),
|
||||||
|
_ => format!("/{}", pool_name), // Default fallback
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse storage metrics into pools and drives
|
||||||
|
fn update_storage_from_metrics(&mut self, metrics: &[&Metric]) {
|
||||||
|
let mut pools: std::collections::HashMap<String, StoragePool> = std::collections::HashMap::new();
|
||||||
|
|
||||||
|
for metric in metrics {
|
||||||
|
if metric.name.starts_with("disk_") {
|
||||||
|
if let Some(pool_name) = self.extract_pool_name(&metric.name) {
|
||||||
|
let mount_point = self.get_mount_point_for_pool(&pool_name);
|
||||||
|
let pool = pools.entry(pool_name.clone()).or_insert_with(|| StoragePool {
|
||||||
|
name: pool_name.clone(),
|
||||||
|
mount_point: mount_point.clone(),
|
||||||
|
pool_type: "Single".to_string(), // Default, could be enhanced
|
||||||
|
drives: Vec::new(),
|
||||||
|
usage_percent: None,
|
||||||
|
used_gb: None,
|
||||||
|
total_gb: None,
|
||||||
|
status: Status::Unknown,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Parse different metric types
|
||||||
|
if metric.name.contains("_usage_percent") {
|
||||||
|
if let MetricValue::Float(usage) = metric.value {
|
||||||
|
pool.usage_percent = Some(usage);
|
||||||
|
pool.status = metric.status.clone();
|
||||||
|
}
|
||||||
|
} else if metric.name.contains("_used_gb") {
|
||||||
|
if let MetricValue::Float(used) = metric.value {
|
||||||
|
pool.used_gb = Some(used);
|
||||||
|
}
|
||||||
|
} else if metric.name.contains("_total_gb") {
|
||||||
|
if let MetricValue::Float(total) = metric.value {
|
||||||
|
pool.total_gb = Some(total);
|
||||||
|
}
|
||||||
|
} else if metric.name.contains("_temperature") {
|
||||||
|
if let Some(drive_name) = self.extract_drive_name(&metric.name) {
|
||||||
|
// Find existing drive or create new one
|
||||||
|
let drive_exists = pool.drives.iter().any(|d| d.name == drive_name);
|
||||||
|
if !drive_exists {
|
||||||
|
pool.drives.push(StorageDrive {
|
||||||
|
name: drive_name.clone(),
|
||||||
|
temperature: None,
|
||||||
|
wear_percent: None,
|
||||||
|
status: Status::Unknown,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(drive) = pool.drives.iter_mut().find(|d| d.name == drive_name) {
|
||||||
|
if let MetricValue::Float(temp) = metric.value {
|
||||||
|
drive.temperature = Some(temp);
|
||||||
|
drive.status = metric.status.clone();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if metric.name.contains("_wear_percent") {
|
||||||
|
if let Some(drive_name) = self.extract_drive_name(&metric.name) {
|
||||||
|
// Find existing drive or create new one
|
||||||
|
let drive_exists = pool.drives.iter().any(|d| d.name == drive_name);
|
||||||
|
if !drive_exists {
|
||||||
|
pool.drives.push(StorageDrive {
|
||||||
|
name: drive_name.clone(),
|
||||||
|
temperature: None,
|
||||||
|
wear_percent: None,
|
||||||
|
status: Status::Unknown,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(drive) = pool.drives.iter_mut().find(|d| d.name == drive_name) {
|
||||||
|
if let MetricValue::Float(wear) = metric.value {
|
||||||
|
drive.wear_percent = Some(wear);
|
||||||
|
drive.status = metric.status.clone();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert to sorted vec for consistent ordering
|
||||||
|
let mut pool_list: Vec<StoragePool> = pools.into_values().collect();
|
||||||
|
pool_list.sort_by(|a, b| a.name.cmp(&b.name)); // Sort alphabetically by name
|
||||||
|
self.storage_pools = pool_list;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract pool name from disk metric name
|
||||||
|
fn extract_pool_name(&self, metric_name: &str) -> Option<String> {
|
||||||
|
// Pattern: disk_{pool_name}_{drive_name}_{metric_type}
|
||||||
|
// Since pool_name can contain underscores, work backwards from known metric suffixes
|
||||||
|
if metric_name.starts_with("disk_") {
|
||||||
|
// First try drive-specific metrics that have device names
|
||||||
|
if let Some(suffix_pos) = metric_name.rfind("_temperature")
|
||||||
|
.or_else(|| metric_name.rfind("_wear_percent"))
|
||||||
|
.or_else(|| metric_name.rfind("_health")) {
|
||||||
|
// Find the second-to-last underscore to get pool name
|
||||||
|
let before_suffix = &metric_name[..suffix_pos];
|
||||||
|
if let Some(drive_start) = before_suffix.rfind('_') {
|
||||||
|
return Some(metric_name[5..drive_start].to_string()); // Skip "disk_"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// For pool-level metrics (usage_percent, used_gb, total_gb), take everything before the metric suffix
|
||||||
|
else if let Some(suffix_pos) = metric_name.rfind("_usage_percent")
|
||||||
|
.or_else(|| metric_name.rfind("_used_gb"))
|
||||||
|
.or_else(|| metric_name.rfind("_total_gb")) {
|
||||||
|
return Some(metric_name[5..suffix_pos].to_string()); // Skip "disk_"
|
||||||
|
}
|
||||||
|
// Fallback to old behavior for unknown patterns
|
||||||
|
else if let Some(captures) = metric_name.strip_prefix("disk_") {
|
||||||
|
if let Some(pos) = captures.find('_') {
|
||||||
|
return Some(captures[..pos].to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract drive name from disk metric name
|
||||||
|
fn extract_drive_name(&self, metric_name: &str) -> Option<String> {
|
||||||
|
// Pattern: disk_{pool_name}_{drive_name}_{metric_type}
|
||||||
|
// Since pool_name can contain underscores, work backwards from known metric suffixes
|
||||||
|
if metric_name.starts_with("disk_") {
|
||||||
|
if let Some(suffix_pos) = metric_name.rfind("_temperature")
|
||||||
|
.or_else(|| metric_name.rfind("_wear_percent"))
|
||||||
|
.or_else(|| metric_name.rfind("_health")) {
|
||||||
|
// Find the second-to-last underscore to get the drive name
|
||||||
|
let before_suffix = &metric_name[..suffix_pos];
|
||||||
|
if let Some(drive_start) = before_suffix.rfind('_') {
|
||||||
|
return Some(before_suffix[drive_start + 1..].to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Render storage section with tree structure
|
||||||
|
fn render_storage(&self) -> Vec<Line<'_>> {
|
||||||
|
let mut lines = Vec::new();
|
||||||
|
|
||||||
|
for pool in &self.storage_pools {
|
||||||
|
// Pool header line
|
||||||
|
let usage_text = match (pool.usage_percent, pool.used_gb, pool.total_gb) {
|
||||||
|
(Some(pct), Some(used), Some(total)) => {
|
||||||
|
format!("{:.0}% {:.1}GB/{:.1}GB", pct, used, total)
|
||||||
|
}
|
||||||
|
_ => "—% —GB/—GB".to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let pool_label = if pool.pool_type.to_lowercase() == "single" {
|
||||||
|
format!("{}:", pool.mount_point)
|
||||||
|
} else {
|
||||||
|
format!("{} ({}):", pool.mount_point, pool.pool_type)
|
||||||
|
};
|
||||||
|
let pool_spans = StatusIcons::create_status_spans(
|
||||||
|
pool.status.clone(),
|
||||||
|
&pool_label
|
||||||
|
);
|
||||||
|
lines.push(Line::from(pool_spans));
|
||||||
|
|
||||||
|
// Drive lines with tree structure
|
||||||
|
let has_usage_line = pool.usage_percent.is_some();
|
||||||
|
for (i, drive) in pool.drives.iter().enumerate() {
|
||||||
|
let is_last_drive = i == pool.drives.len() - 1;
|
||||||
|
let tree_symbol = if is_last_drive && !has_usage_line { "└─" } else { "├─" };
|
||||||
|
|
||||||
|
let mut drive_info = Vec::new();
|
||||||
|
if let Some(temp) = drive.temperature {
|
||||||
|
drive_info.push(format!("T: {:.0}C", temp));
|
||||||
|
}
|
||||||
|
if let Some(wear) = drive.wear_percent {
|
||||||
|
drive_info.push(format!("W: {:.0}%", wear));
|
||||||
|
}
|
||||||
|
let drive_text = if drive_info.is_empty() {
|
||||||
|
drive.name.clone()
|
||||||
|
} else {
|
||||||
|
format!("{} {}", drive.name, drive_info.join(" • "))
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut drive_spans = vec![
|
||||||
|
Span::raw(" "),
|
||||||
|
Span::styled(tree_symbol, Typography::tree()),
|
||||||
|
Span::raw(" "),
|
||||||
|
];
|
||||||
|
drive_spans.extend(StatusIcons::create_status_spans(drive.status.clone(), &drive_text));
|
||||||
|
lines.push(Line::from(drive_spans));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Usage line
|
||||||
|
if pool.usage_percent.is_some() {
|
||||||
|
let tree_symbol = "└─";
|
||||||
|
let mut usage_spans = vec![
|
||||||
|
Span::raw(" "),
|
||||||
|
Span::styled(tree_symbol, Typography::tree()),
|
||||||
|
Span::raw(" "),
|
||||||
|
];
|
||||||
|
usage_spans.extend(StatusIcons::create_status_spans(pool.status.clone(), &usage_text));
|
||||||
|
lines.push(Line::from(usage_spans));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
lines
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Widget for SystemWidget {
    /// Ingest a batch of metrics and update the widget's cached display state.
    ///
    /// Metric names are matched by exact string; names not listed here are
    /// ignored. Storage/disk metrics are delegated to
    /// `update_storage_from_metrics` at the end.
    fn update_from_metrics(&mut self, metrics: &[&Metric]) {
        // An empty batch means there is nothing to display yet.
        self.has_data = !metrics.is_empty();

        for metric in metrics {
            match metric.name.as_str() {
                // NixOS metrics
                "system_nixos_build" => {
                    if let MetricValue::String(build) = &metric.value {
                        self.nixos_build = Some(build.clone());
                    }
                }
                "system_config_hash" => {
                    if let MetricValue::String(hash) = &metric.value {
                        self.config_hash = Some(hash.clone());
                    }
                }
                "agent_version" => {
                    if let MetricValue::String(version) = &metric.value {
                        self.agent_hash = Some(version.clone());
                    }
                }

                // CPU metrics
                "cpu_load_1min" => {
                    if let MetricValue::Float(load) = metric.value {
                        self.cpu_load_1min = Some(load);
                        // The 1-minute load metric carries the overall CPU status.
                        self.cpu_status = metric.status.clone();
                    }
                }
                "cpu_load_5min" => {
                    if let MetricValue::Float(load) = metric.value {
                        self.cpu_load_5min = Some(load);
                    }
                }
                "cpu_load_15min" => {
                    if let MetricValue::Float(load) = metric.value {
                        self.cpu_load_15min = Some(load);
                    }
                }
                "cpu_frequency_mhz" => {
                    if let MetricValue::Float(freq) = metric.value {
                        self.cpu_frequency = Some(freq);
                    }
                }

                // Memory metrics
                "memory_usage_percent" => {
                    if let MetricValue::Float(usage) = metric.value {
                        self.memory_usage_percent = Some(usage);
                        // The usage-percent metric carries the memory status.
                        self.memory_status = metric.status.clone();
                    }
                }
                "memory_used_gb" => {
                    if let MetricValue::Float(used) = metric.value {
                        self.memory_used_gb = Some(used);
                    }
                }
                "memory_total_gb" => {
                    if let MetricValue::Float(total) = metric.value {
                        self.memory_total_gb = Some(total);
                    }
                }

                // Tmpfs metrics
                "memory_tmp_usage_percent" => {
                    if let MetricValue::Float(usage) = metric.value {
                        self.tmp_usage_percent = Some(usage);
                        self.tmp_status = metric.status.clone();
                    }
                }
                "memory_tmp_used_gb" => {
                    if let MetricValue::Float(used) = metric.value {
                        self.tmp_used_gb = Some(used);
                    }
                }
                "memory_tmp_total_gb" => {
                    if let MetricValue::Float(total) = metric.value {
                        self.tmp_total_gb = Some(total);
                    }
                }
                // Metrics not displayed by this widget are ignored.
                _ => {}
            }
        }

        // Update storage from all disk metrics
        self.update_storage_from_metrics(metrics);
    }

}
|
||||||
|
|
||||||
|
impl SystemWidget {
    /// Render with scroll offset support
    ///
    /// Builds the full line list (NixOS, CPU, RAM, Storage sections), clips
    /// overflowing storage lines with a "... and N more pool(s)" indicator,
    /// then applies `scroll_offset` before drawing into `area`.
    pub fn render_with_scroll(&mut self, frame: &mut Frame, area: Rect, scroll_offset: usize, hostname: &str) {
        let mut lines = Vec::new();

        // NixOS section
        lines.push(Line::from(vec![
            Span::styled(format!("NixOS {}:", hostname), Typography::widget_title())
        ]));

        let build_text = self.nixos_build.as_deref().unwrap_or("unknown");
        lines.push(Line::from(vec![
            Span::styled(format!("Build: {}", build_text), Typography::secondary())
        ]));

        let agent_version_text = self.agent_hash.as_deref().unwrap_or("unknown");
        lines.push(Line::from(vec![
            Span::styled(format!("Agent: {}", agent_version_text), Typography::secondary())
        ]));


        // CPU section
        lines.push(Line::from(vec![
            Span::styled("CPU:", Typography::widget_title())
        ]));

        let load_text = self.format_cpu_load();
        let cpu_spans = StatusIcons::create_status_spans(
            self.cpu_status.clone(),
            &format!("Load: {}", load_text)
        );
        lines.push(Line::from(cpu_spans));

        let freq_text = self.format_cpu_frequency();
        lines.push(Line::from(vec![
            Span::styled("  └─ ", Typography::tree()),
            Span::styled(format!("Freq: {}", freq_text), Typography::secondary())
        ]));

        // RAM section
        lines.push(Line::from(vec![
            Span::styled("RAM:", Typography::widget_title())
        ]));

        let memory_text = self.format_memory_usage();
        let memory_spans = StatusIcons::create_status_spans(
            self.memory_status.clone(),
            &format!("Usage: {}", memory_text)
        );
        lines.push(Line::from(memory_spans));

        let tmp_text = self.format_tmp_usage();
        let mut tmp_spans = vec![
            Span::styled("  └─ ", Typography::tree()),
        ];
        tmp_spans.extend(StatusIcons::create_status_spans(
            self.tmp_status.clone(),
            &format!("/tmp: {}", tmp_text)
        ));
        lines.push(Line::from(tmp_spans));

        // Storage section
        lines.push(Line::from(vec![
            Span::styled("Storage:", Typography::widget_title())
        ]));

        // Storage items with overflow handling
        let storage_lines = self.render_storage();
        let remaining_space = area.height.saturating_sub(lines.len() as u16);

        if storage_lines.len() <= remaining_space as usize {
            // All storage lines fit
            lines.extend(storage_lines);
        } else if remaining_space >= 2 {
            // Show what we can and add overflow indicator
            let lines_to_show = (remaining_space - 1) as usize; // Reserve 1 line for overflow
            lines.extend(storage_lines.iter().take(lines_to_show).cloned());

            // Count hidden pools. Only non-indented lines containing ':' are
            // counted as pool headers; hidden drive/usage lines of a
            // partially-shown pool are not counted.
            let mut hidden_pools = 0;
            let mut current_pool = String::new();
            for (i, line) in storage_lines.iter().enumerate() {
                if i >= lines_to_show {
                    // Check if this line represents a new pool (no indentation)
                    if let Some(first_span) = line.spans.first() {
                        let text = first_span.content.as_ref();
                        if !text.starts_with(" ") && text.contains(':') {
                            let pool_name = text.split(':').next().unwrap_or("").trim();
                            if pool_name != current_pool {
                                hidden_pools += 1;
                                current_pool = pool_name.to_string();
                            }
                        }
                    }
                }
            }

            if hidden_pools > 0 {
                let overflow_text = format!(
                    "... and {} more pool{}",
                    hidden_pools,
                    if hidden_pools == 1 { "" } else { "s" }
                );
                lines.push(Line::from(vec![
                    Span::styled(overflow_text, Typography::muted())
                ]));
            }
        }

        // Apply scroll offset
        let total_lines = lines.len();
        let available_height = area.height as usize;

        // Always apply scrolling if scroll_offset > 0, even if content fits
        if scroll_offset > 0 || total_lines > available_height {
            let max_scroll = if total_lines > available_height {
                total_lines - available_height
            } else {
                // Content fits but a scroll was requested: allow scrolling up
                // to one line short of hiding everything.
                total_lines.saturating_sub(1)
            };
            let effective_scroll = scroll_offset.min(max_scroll);

            // Take only the visible portion after scrolling
            let visible_lines: Vec<Line> = lines
                .into_iter()
                .skip(effective_scroll)
                .take(available_height)
                .collect();

            let paragraph = Paragraph::new(Text::from(visible_lines));
            frame.render_widget(paragraph, area);
        } else {
            // All content fits and no scroll offset, render normally
            let paragraph = Paragraph::new(Text::from(lines));
            frame.render_widget(paragraph, area);
        }
    }
}
|
||||||
@@ -1,9 +1,10 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "cm-dashboard-shared"
|
name = "cm-dashboard-shared"
|
||||||
version = "0.1.0"
|
version = "0.1.50"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
serde = { workspace = true }
|
||||||
serde_json = "1.0"
|
serde_json = { workspace = true }
|
||||||
chrono = { version = "0.4", features = ["serde"] }
|
chrono = { workspace = true }
|
||||||
|
thiserror = { workspace = true }
|
||||||
16
shared/src/cache.rs
Normal file
16
shared/src/cache.rs
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
/// Cache configuration
|
||||||
|
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||||
|
pub struct CacheConfig {
|
||||||
|
pub persist_path: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for CacheConfig {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
persist_path: "/var/lib/cm-dashboard/cache.json".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@@ -1,23 +0,0 @@
|
|||||||
use serde::{Deserialize, Serialize};
|
|
||||||
use serde_json::Value;
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
|
||||||
#[serde(rename_all = "snake_case")]
|
|
||||||
pub enum AgentType {
|
|
||||||
Smart,
|
|
||||||
Service,
|
|
||||||
System,
|
|
||||||
Backup,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
||||||
pub struct MetricsEnvelope {
|
|
||||||
pub hostname: String,
|
|
||||||
pub agent_type: AgentType,
|
|
||||||
pub timestamp: u64,
|
|
||||||
#[serde(default)]
|
|
||||||
pub metrics: Value,
|
|
||||||
}
|
|
||||||
|
|
||||||
// Alias for backward compatibility
|
|
||||||
pub type MessageEnvelope = MetricsEnvelope;
|
|
||||||
21
shared/src/error.rs
Normal file
21
shared/src/error.rs
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
/// Errors shared across the agent and dashboard crates.
#[derive(Debug, Error)]
pub enum SharedError {
    /// Failure while serializing or deserializing a payload.
    #[error("Serialization error: {message}")]
    Serialization { message: String },

    /// A metric carried a value that could not be interpreted.
    #[error("Invalid metric value: {message}")]
    InvalidMetric { message: String },

    /// Violation of the message protocol (e.g. unexpected envelope type).
    #[error("Protocol error: {message}")]
    Protocol { message: String },
}

/// Allow `?` on `serde_json` results by folding them into `Serialization`.
impl From<serde_json::Error> for SharedError {
    fn from(err: serde_json::Error) -> Self {
        SharedError::Serialization {
            message: err.to_string(),
        }
    }
}
||||||
@@ -1 +1,9 @@
|
|||||||
pub mod envelope;
|
pub mod cache;
|
||||||
|
pub mod error;
|
||||||
|
pub mod metrics;
|
||||||
|
pub mod protocol;
|
||||||
|
|
||||||
|
pub use cache::*;
|
||||||
|
pub use error::*;
|
||||||
|
pub use metrics::*;
|
||||||
|
pub use protocol::*;
|
||||||
|
|||||||
285
shared/src/metrics.rs
Normal file
285
shared/src/metrics.rs
Normal file
@@ -0,0 +1,285 @@
|
|||||||
|
use chrono::Utc;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
/// Individual metric with value, status, and metadata
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Metric {
    // Metric name, e.g. "cpu_load_1min" (see the `registry` module).
    pub name: String,
    // The measured value.
    pub value: MetricValue,
    // Health status assessed for this measurement.
    pub status: Status,
    // Unix timestamp (seconds) when the metric was captured.
    pub timestamp: u64,
    // Optional human-readable description.
    pub description: Option<String>,
    // Optional unit label.
    pub unit: Option<String>,
}
|
||||||
|
|
||||||
|
impl Metric {
|
||||||
|
pub fn new(name: String, value: MetricValue, status: Status) -> Self {
|
||||||
|
Self {
|
||||||
|
name,
|
||||||
|
value,
|
||||||
|
status,
|
||||||
|
timestamp: Utc::now().timestamp() as u64,
|
||||||
|
description: None,
|
||||||
|
unit: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_description(mut self, description: String) -> Self {
|
||||||
|
self.description = Some(description);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_unit(mut self, unit: String) -> Self {
|
||||||
|
self.unit = Some(unit);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Typed metric values
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum MetricValue {
    // 32-bit floating point measurement.
    Float(f32),
    // Signed integer measurement.
    Integer(i64),
    // Free-form text value (e.g. version or build strings).
    String(String),
    // Boolean flag value.
    Boolean(bool),
}
|
||||||
|
|
||||||
|
impl MetricValue {
|
||||||
|
pub fn as_f32(&self) -> Option<f32> {
|
||||||
|
match self {
|
||||||
|
MetricValue::Float(f) => Some(*f),
|
||||||
|
MetricValue::Integer(i) => Some(*i as f32),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_i64(&self) -> Option<i64> {
|
||||||
|
match self {
|
||||||
|
MetricValue::Integer(i) => Some(*i),
|
||||||
|
MetricValue::Float(f) => Some(*f as i64),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_string(&self) -> String {
|
||||||
|
match self {
|
||||||
|
MetricValue::String(s) => s.clone(),
|
||||||
|
MetricValue::Float(f) => f.to_string(),
|
||||||
|
MetricValue::Integer(i) => i.to_string(),
|
||||||
|
MetricValue::Boolean(b) => b.to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_bool(&self) -> Option<bool> {
|
||||||
|
match self {
|
||||||
|
MetricValue::Boolean(b) => Some(*b),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Health status for metrics
///
/// Variants are declared in ascending severity so the derived `Ord` lets
/// `aggregate` pick the worst status with `max`. Note that `Unknown` is
/// declared after `Critical` and therefore dominates aggregation.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum Status {
    Ok,
    Pending,
    Warning,
    Critical,
    Unknown,
}

impl Status {
    /// Aggregate multiple statuses - returns the worst status
    pub fn aggregate(statuses: &[Status]) -> Status {
        // `max` relies on the variant declaration order above; an empty
        // slice yields Unknown.
        statuses.iter().max().copied().unwrap_or(Status::Unknown)
    }
}

impl Default for Status {
    // Unknown until a first measurement establishes a real status.
    fn default() -> Self {
        Status::Unknown
    }
}
|
||||||
|
|
||||||
|
/// Hysteresis thresholds for preventing status flapping
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct HysteresisThresholds {
|
||||||
|
/// Warning threshold - trigger warning when value >= this
|
||||||
|
pub warning_high: f32,
|
||||||
|
/// Warning recovery - return to ok when value < this
|
||||||
|
pub warning_low: f32,
|
||||||
|
/// Critical threshold - trigger critical when value >= this
|
||||||
|
pub critical_high: f32,
|
||||||
|
/// Critical recovery - return to warning when value < this
|
||||||
|
pub critical_low: f32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HysteresisThresholds {
|
||||||
|
pub fn new(warning_high: f32, critical_high: f32) -> Self {
|
||||||
|
// Default hysteresis: 10% gap for recovery
|
||||||
|
let warning_gap = warning_high * 0.1;
|
||||||
|
let critical_gap = critical_high * 0.1;
|
||||||
|
|
||||||
|
Self {
|
||||||
|
warning_high,
|
||||||
|
warning_low: warning_high - warning_gap,
|
||||||
|
critical_high,
|
||||||
|
critical_low: critical_high - critical_gap,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_custom_gaps(warning_high: f32, warning_gap: f32, critical_high: f32, critical_gap: f32) -> Self {
|
||||||
|
Self {
|
||||||
|
warning_high,
|
||||||
|
warning_low: warning_high - warning_gap,
|
||||||
|
critical_high,
|
||||||
|
critical_low: critical_high - critical_gap,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate status with hysteresis based on current value and previous status
|
||||||
|
pub fn calculate_status(&self, value: f32, previous_status: Status) -> Status {
|
||||||
|
match previous_status {
|
||||||
|
Status::Ok => {
|
||||||
|
if value >= self.critical_high {
|
||||||
|
Status::Critical
|
||||||
|
} else if value >= self.warning_high {
|
||||||
|
Status::Warning
|
||||||
|
} else {
|
||||||
|
Status::Ok
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Status::Warning => {
|
||||||
|
if value >= self.critical_high {
|
||||||
|
Status::Critical
|
||||||
|
} else if value < self.warning_low {
|
||||||
|
Status::Ok
|
||||||
|
} else {
|
||||||
|
Status::Warning
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Status::Critical => {
|
||||||
|
if value < self.critical_low {
|
||||||
|
if value < self.warning_low {
|
||||||
|
Status::Ok
|
||||||
|
} else {
|
||||||
|
Status::Warning
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Status::Critical
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Status::Unknown => {
|
||||||
|
// First measurement, use normal thresholds
|
||||||
|
if value >= self.critical_high {
|
||||||
|
Status::Critical
|
||||||
|
} else if value >= self.warning_high {
|
||||||
|
Status::Warning
|
||||||
|
} else {
|
||||||
|
Status::Ok
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Status::Pending => {
|
||||||
|
// Service transitioning, use normal thresholds like first measurement
|
||||||
|
if value >= self.critical_high {
|
||||||
|
Status::Critical
|
||||||
|
} else if value >= self.warning_high {
|
||||||
|
Status::Warning
|
||||||
|
} else {
|
||||||
|
Status::Ok
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Status tracker for hysteresis - tracks previous status per metric
|
||||||
|
#[derive(Debug, Default)]
|
||||||
|
pub struct StatusTracker {
|
||||||
|
previous_statuses: HashMap<String, Status>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StatusTracker {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self::default()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get previous status for a metric
|
||||||
|
pub fn get_previous_status(&self, metric_name: &str) -> Status {
|
||||||
|
self.previous_statuses.get(metric_name).copied().unwrap_or(Status::Unknown)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Update status for a metric
|
||||||
|
pub fn update_status(&mut self, metric_name: String, status: Status) {
|
||||||
|
self.previous_statuses.insert(metric_name, status);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate status with hysteresis
|
||||||
|
pub fn calculate_with_hysteresis(&mut self, metric_name: &str, value: f32, thresholds: &HysteresisThresholds) -> Status {
|
||||||
|
let previous = self.get_previous_status(metric_name);
|
||||||
|
let new_status = thresholds.calculate_status(value, previous);
|
||||||
|
self.update_status(metric_name.to_string(), new_status);
|
||||||
|
new_status
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Metric name registry - constants for all metric names
///
/// Template constants contain a `{placeholder}` that the helper functions
/// substitute with a concrete device/service/interface name.
pub mod registry {
    // CPU metrics
    pub const CPU_LOAD_1MIN: &str = "cpu_load_1min";
    pub const CPU_LOAD_5MIN: &str = "cpu_load_5min";
    pub const CPU_LOAD_15MIN: &str = "cpu_load_15min";
    pub const CPU_TEMPERATURE_CELSIUS: &str = "cpu_temperature_celsius";
    pub const CPU_FREQUENCY_MHZ: &str = "cpu_frequency_mhz";
    pub const CPU_USAGE_PERCENT: &str = "cpu_usage_percent";

    // Memory metrics
    pub const MEMORY_USAGE_PERCENT: &str = "memory_usage_percent";
    pub const MEMORY_TOTAL_GB: &str = "memory_total_gb";
    pub const MEMORY_USED_GB: &str = "memory_used_gb";
    pub const MEMORY_AVAILABLE_GB: &str = "memory_available_gb";
    pub const MEMORY_SWAP_TOTAL_GB: &str = "memory_swap_total_gb";
    pub const MEMORY_SWAP_USED_GB: &str = "memory_swap_used_gb";

    // Disk metrics (template - actual names include device)
    pub const DISK_USAGE_PERCENT_TEMPLATE: &str = "disk_{device}_usage_percent";
    pub const DISK_TEMPERATURE_CELSIUS_TEMPLATE: &str = "disk_{device}_temperature_celsius";
    pub const DISK_WEAR_PERCENT_TEMPLATE: &str = "disk_{device}_wear_percent";
    pub const DISK_SPARE_PERCENT_TEMPLATE: &str = "disk_{device}_spare_percent";
    pub const DISK_HOURS_TEMPLATE: &str = "disk_{device}_hours";
    pub const DISK_CAPACITY_GB_TEMPLATE: &str = "disk_{device}_capacity_gb";

    // Service metrics (template - actual names include service)
    pub const SERVICE_STATUS_TEMPLATE: &str = "service_{name}_status";
    pub const SERVICE_MEMORY_MB_TEMPLATE: &str = "service_{name}_memory_mb";
    pub const SERVICE_CPU_PERCENT_TEMPLATE: &str = "service_{name}_cpu_percent";

    // Backup metrics
    pub const BACKUP_STATUS: &str = "backup_status";
    pub const BACKUP_LAST_RUN_TIMESTAMP: &str = "backup_last_run_timestamp";
    pub const BACKUP_SIZE_GB: &str = "backup_size_gb";
    pub const BACKUP_DURATION_MINUTES: &str = "backup_duration_minutes";
    pub const BACKUP_NEXT_SCHEDULED_TIMESTAMP: &str = "backup_next_scheduled_timestamp";

    // Network metrics (template - actual names include interface)
    pub const NETWORK_RX_BYTES_TEMPLATE: &str = "network_{interface}_rx_bytes";
    pub const NETWORK_TX_BYTES_TEMPLATE: &str = "network_{interface}_tx_bytes";
    pub const NETWORK_RX_PACKETS_TEMPLATE: &str = "network_{interface}_rx_packets";
    pub const NETWORK_TX_PACKETS_TEMPLATE: &str = "network_{interface}_tx_packets";

    /// Substitute a placeholder in a template; shared by the helpers below.
    fn fill(template: &str, placeholder: &str, value: &str) -> String {
        template.replace(placeholder, value)
    }

    /// Generate disk metric name from template
    pub fn disk_metric(template: &str, device: &str) -> String {
        fill(template, "{device}", device)
    }

    /// Generate service metric name from template
    pub fn service_metric(template: &str, name: &str) -> String {
        fill(template, "{name}", name)
    }

    /// Generate network metric name from template
    pub fn network_metric(template: &str, interface: &str) -> String {
        fill(template, "{interface}", interface)
    }
}
|
||||||
157
shared/src/protocol.rs
Normal file
157
shared/src/protocol.rs
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
use crate::metrics::Metric;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
/// Message sent from agent to dashboard via ZMQ
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricMessage {
    // Originating host.
    pub hostname: String,
    // Unix timestamp (seconds) when the message was created.
    pub timestamp: u64,
    // Batch of metrics carried by this message.
    pub metrics: Vec<Metric>,
}
|
||||||
|
|
||||||
|
/// Command output streaming message
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CommandOutputMessage {
    // Originating host.
    pub hostname: String,
    // Identifier correlating output lines to the issued command.
    pub command_id: String,
    // Kind of command that produced this output.
    pub command_type: String,
    // One line of command output.
    pub output_line: String,
    // True on the final message of the stream.
    pub is_complete: bool,
    // Unix timestamp (seconds) when the line was emitted.
    pub timestamp: u64,
}
|
||||||
|
|
||||||
|
impl MetricMessage {
|
||||||
|
pub fn new(hostname: String, metrics: Vec<Metric>) -> Self {
|
||||||
|
Self {
|
||||||
|
hostname,
|
||||||
|
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||||
|
metrics,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CommandOutputMessage {
|
||||||
|
pub fn new(hostname: String, command_id: String, command_type: String, output_line: String, is_complete: bool) -> Self {
|
||||||
|
Self {
|
||||||
|
hostname,
|
||||||
|
command_id,
|
||||||
|
command_type,
|
||||||
|
output_line,
|
||||||
|
is_complete,
|
||||||
|
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Commands that can be sent from dashboard to agent
///
/// Transported as a JSON payload inside a `MessageEnvelope` of type
/// `MessageType::Command`.
#[derive(Debug, Serialize, Deserialize)]
pub enum Command {
    /// Request immediate metric refresh
    RefreshMetrics,
    /// Request specific metrics by name
    RequestMetrics { metric_names: Vec<String> },
    /// Ping command for connection testing
    Ping,
}
|
||||||
|
|
||||||
|
/// Response from agent to dashboard commands
///
/// Transported as a JSON payload inside a `MessageEnvelope` of type
/// `MessageType::CommandResponse`.
#[derive(Debug, Serialize, Deserialize)]
pub enum CommandResponse {
    /// Acknowledgment of command
    Ack,
    /// Metrics response
    Metrics(Vec<Metric>),
    /// Pong response to ping
    Pong,
    /// Error response
    Error { message: String },
}
|
||||||
|
|
||||||
|
/// ZMQ message envelope for routing
#[derive(Debug, Serialize, Deserialize)]
pub struct MessageEnvelope {
    // Discriminates how `payload` should be decoded.
    pub message_type: MessageType,
    // JSON-serialized payload bytes; empty for heartbeats.
    pub payload: Vec<u8>,
}
|
||||||
|
|
||||||
|
/// Discriminator for the kind of payload a `MessageEnvelope` carries.
#[derive(Debug, Serialize, Deserialize)]
pub enum MessageType {
    // Payload is a `MetricMessage`.
    Metrics,
    // Payload is a `Command`.
    Command,
    // Payload is a `CommandResponse`.
    CommandResponse,
    // Payload is a `CommandOutputMessage`.
    CommandOutput,
    // No payload; liveness signal.
    Heartbeat,
}
|
||||||
|
|
||||||
|
impl MessageEnvelope {
|
||||||
|
pub fn metrics(message: MetricMessage) -> Result<Self, crate::SharedError> {
|
||||||
|
Ok(Self {
|
||||||
|
message_type: MessageType::Metrics,
|
||||||
|
payload: serde_json::to_vec(&message)?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn command(command: Command) -> Result<Self, crate::SharedError> {
|
||||||
|
Ok(Self {
|
||||||
|
message_type: MessageType::Command,
|
||||||
|
payload: serde_json::to_vec(&command)?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn command_response(response: CommandResponse) -> Result<Self, crate::SharedError> {
|
||||||
|
Ok(Self {
|
||||||
|
message_type: MessageType::CommandResponse,
|
||||||
|
payload: serde_json::to_vec(&response)?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn command_output(message: CommandOutputMessage) -> Result<Self, crate::SharedError> {
|
||||||
|
Ok(Self {
|
||||||
|
message_type: MessageType::CommandOutput,
|
||||||
|
payload: serde_json::to_vec(&message)?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn heartbeat() -> Result<Self, crate::SharedError> {
|
||||||
|
Ok(Self {
|
||||||
|
message_type: MessageType::Heartbeat,
|
||||||
|
payload: Vec::new(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn decode_metrics(&self) -> Result<MetricMessage, crate::SharedError> {
|
||||||
|
match self.message_type {
|
||||||
|
MessageType::Metrics => Ok(serde_json::from_slice(&self.payload)?),
|
||||||
|
_ => Err(crate::SharedError::Protocol {
|
||||||
|
message: "Expected metrics message".to_string(),
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn decode_command(&self) -> Result<Command, crate::SharedError> {
|
||||||
|
match self.message_type {
|
||||||
|
MessageType::Command => Ok(serde_json::from_slice(&self.payload)?),
|
||||||
|
_ => Err(crate::SharedError::Protocol {
|
||||||
|
message: "Expected command message".to_string(),
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn decode_command_response(&self) -> Result<CommandResponse, crate::SharedError> {
|
||||||
|
match self.message_type {
|
||||||
|
MessageType::CommandResponse => Ok(serde_json::from_slice(&self.payload)?),
|
||||||
|
_ => Err(crate::SharedError::Protocol {
|
||||||
|
message: "Expected command response message".to_string(),
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn decode_command_output(&self) -> Result<CommandOutputMessage, crate::SharedError> {
|
||||||
|
match self.message_type {
|
||||||
|
MessageType::CommandOutput => Ok(serde_json::from_slice(&self.payload)?),
|
||||||
|
_ => Err(crate::SharedError::Protocol {
|
||||||
|
message: "Expected command output message".to_string(),
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user