Compare commits
67 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | bd22ce265b | | |
| | bbc8b7b1cb | | |
| | 5dd8cadef3 | | |
| | fefe30ec51 | | |
| | fb40cce748 | | |
| | eaa057b284 | | |
| | f23a1b5cec | | |
| | 3f98f68b51 | | |
| | 3d38a7a984 | | |
| | b0ee0242bd | | |
| | 8f9e9eabca | | |
| | 937f4ad427 | | |
| | 8aefab83ae | | |
| | 748a9f3a3b | | |
| | 5c6b11c794 | | |
| | 9f0aa5f806 | | |
| | fc247bd0ad | | |
| | 00fe8c28ab | | |
| | fbbb4a4cfb | | |
| | 53e1d8bbce | | |
| | 1b9fecea98 | | |
| | b7ffeaced5 | | |
| | 3858309a5d | | |
| | df104bf940 | | |
| | d5ce36ee18 | | |
| | 4f80701671 | | |
| | 267654fda4 | | |
| | dc1105eefe | | |
| | c9d12793ef | | |
| | 8f80015273 | | |
| | 7a95a9d762 | | |
| | 7b11db990c | | |
| | 67b59e9551 | | |
| | da37e28b6a | | |
| | d89b3ac881 | | |
| | 7f26991609 | | |
| | 75ec190b93 | | |
| | eb892096d9 | | |
| | c006625a3f | | |
| | dcd5fff8c1 | | |
| | 9357e5f2a8 | | |
| | d164c1da5f | | |
| | b120f95f8a | | |
| | 66ab7a492d | | |
| | 4d615a7f45 | | |
| | fd7ad23205 | | |
| | 2b2cb2da3e | | |
| | 11d1c2dc94 | | |
| | bea2d120b5 | | |
| | 5394164123 | | |
| | 4329cd26e0 | | |
| | b85bd6b153 | | |
| | c9b2d5e342 | | |
| | b2b301332f | | |
| | adf3b0f51c | | |
| | 41ded0170c | | |
| | 9b4191b2c3 | | |
| | 53dbb43352 | | |
| | ba03623110 | | |
| | f24c4ed650 | | |
| | 86501fd486 | | |
| | 192eea6e0c | | |
| | 43fb838c9b | | |
| | 54483653f9 | | |
| | e47803b705 | | |
| | 439d0d9af6 | | |
| | 2242b5ddfe | | |
@@ -113,13 +113,13 @@ jobs:
|
|||||||
NIX_HASH="sha256-$(python3 -c "import base64, binascii; print(base64.b64encode(binascii.unhexlify('$NEW_HASH')).decode())")"
|
NIX_HASH="sha256-$(python3 -c "import base64, binascii; print(base64.b64encode(binascii.unhexlify('$NEW_HASH')).decode())")"
|
||||||
|
|
||||||
# Update the NixOS configuration
|
# Update the NixOS configuration
|
||||||
sed -i "s|version = \"v[^\"]*\"|version = \"$VERSION\"|" hosts/services/cm-dashboard.nix
|
sed -i "s|version = \"v[^\"]*\"|version = \"$VERSION\"|" services/cm-dashboard.nix
|
||||||
sed -i "s|sha256 = \"sha256-[^\"]*\"|sha256 = \"$NIX_HASH\"|" hosts/services/cm-dashboard.nix
|
sed -i "s|sha256 = \"sha256-[^\"]*\"|sha256 = \"$NIX_HASH\"|" services/cm-dashboard.nix
|
||||||
|
|
||||||
# Commit and push changes
|
# Commit and push changes
|
||||||
git config user.name "Gitea Actions"
|
git config user.name "Gitea Actions"
|
||||||
git config user.email "actions@gitea.cmtec.se"
|
git config user.email "actions@gitea.cmtec.se"
|
||||||
git add hosts/services/cm-dashboard.nix
|
git add services/cm-dashboard.nix
|
||||||
git commit -m "Auto-update cm-dashboard to $VERSION
|
git commit -m "Auto-update cm-dashboard to $VERSION
|
||||||
|
|
||||||
- Update version to $VERSION with automated release
|
- Update version to $VERSION with automated release
|
||||||
|
|||||||
221
CLAUDE.md
221
CLAUDE.md
@@ -7,6 +7,7 @@ A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure.
|
|||||||
## Current Features
|
## Current Features
|
||||||
|
|
||||||
### Core Functionality
|
### Core Functionality
|
||||||
|
|
||||||
- **Real-time Monitoring**: CPU, RAM, Storage, and Service status
|
- **Real-time Monitoring**: CPU, RAM, Storage, and Service status
|
||||||
- **Service Management**: Start/stop services with user-stopped tracking
|
- **Service Management**: Start/stop services with user-stopped tracking
|
||||||
- **Multi-host Support**: Monitor multiple servers from single dashboard
|
- **Multi-host Support**: Monitor multiple servers from single dashboard
|
||||||
@@ -14,6 +15,7 @@ A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure.
|
|||||||
- **Backup Monitoring**: Borgbackup status and scheduling
|
- **Backup Monitoring**: Borgbackup status and scheduling
|
||||||
|
|
||||||
### User-Stopped Service Tracking
|
### User-Stopped Service Tracking
|
||||||
|
|
||||||
- Services stopped via dashboard are marked as "user-stopped"
|
- Services stopped via dashboard are marked as "user-stopped"
|
||||||
- User-stopped services report Status::OK instead of Warning
|
- User-stopped services report Status::OK instead of Warning
|
||||||
- Prevents false alerts during intentional maintenance
|
- Prevents false alerts during intentional maintenance
|
||||||
@@ -21,9 +23,11 @@ A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure.
|
|||||||
- Automatic flag clearing when services are restarted via dashboard
|
- Automatic flag clearing when services are restarted via dashboard
|
||||||
|
|
||||||
### Custom Service Logs
|
### Custom Service Logs
|
||||||
|
|
||||||
- Configure service-specific log file paths per host in dashboard config
|
- Configure service-specific log file paths per host in dashboard config
|
||||||
- Press `L` on any service to view custom log files via `tail -f`
|
- Press `L` on any service to view custom log files via `tail -f`
|
||||||
- Configuration format in dashboard config:
|
- Configuration format in dashboard config:
|
||||||
|
|
||||||
```toml
|
```toml
|
||||||
[service_logs]
|
[service_logs]
|
||||||
hostname1 = [
|
hostname1 = [
|
||||||
@@ -36,6 +40,7 @@ hostname2 = [
|
|||||||
```
|
```
|
||||||
|
|
||||||
### Service Management
|
### Service Management
|
||||||
|
|
||||||
- **Direct Control**: Arrow keys (↑↓) or vim keys (j/k) navigate services
|
- **Direct Control**: Arrow keys (↑↓) or vim keys (j/k) navigate services
|
||||||
- **Service Actions**:
|
- **Service Actions**:
|
||||||
- `s` - Start service (sends UserStart command)
|
- `s` - Start service (sends UserStart command)
|
||||||
@@ -47,6 +52,7 @@ hostname2 = [
|
|||||||
- **Transitional Icons**: Blue arrows during operations
|
- **Transitional Icons**: Blue arrows during operations
|
||||||
|
|
||||||
### Navigation
|
### Navigation
|
||||||
|
|
||||||
- **Tab**: Switch between hosts
|
- **Tab**: Switch between hosts
|
||||||
- **↑↓ or j/k**: Select services
|
- **↑↓ or j/k**: Select services
|
||||||
- **s**: Start selected service (UserStart)
|
- **s**: Start selected service (UserStart)
|
||||||
@@ -59,18 +65,105 @@ hostname2 = [
|
|||||||
|
|
||||||
## Core Architecture Principles
|
## Core Architecture Principles
|
||||||
|
|
||||||
### Individual Metrics Philosophy
|
### Structured Data Architecture (✅ IMPLEMENTED v0.1.131)
|
||||||
- Agent collects individual metrics, dashboard composes widgets
|
|
||||||
- Each metric collected, transmitted, and stored individually
|
Complete migration from string-based metrics to structured JSON data. Eliminates all string parsing bugs and provides type-safe data access.
|
||||||
- Agent calculates status for each metric using thresholds
|
|
||||||
- Dashboard aggregates individual metric statuses for widget status
|
**Previous (String Metrics):**
|
||||||
|
|
||||||
|
- ❌ Agent sent individual metrics with string names like `disk_nvme0n1_temperature`
|
||||||
|
- ❌ Dashboard parsed metric names with underscore counting and string splitting
|
||||||
|
- ❌ Complex and error-prone metric filtering and extraction logic
|
||||||
|
|
||||||
|
**Current (Structured Data):**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"hostname": "cmbox",
|
||||||
|
"agent_version": "v0.1.131",
|
||||||
|
"timestamp": 1763926877,
|
||||||
|
"system": {
|
||||||
|
"cpu": {
|
||||||
|
"load_1min": 3.5,
|
||||||
|
"load_5min": 3.57,
|
||||||
|
"load_15min": 3.58,
|
||||||
|
"frequency_mhz": 1500,
|
||||||
|
"temperature_celsius": 45.2
|
||||||
|
},
|
||||||
|
"memory": {
|
||||||
|
"usage_percent": 25.0,
|
||||||
|
"total_gb": 23.3,
|
||||||
|
"used_gb": 5.9,
|
||||||
|
"swap_total_gb": 10.7,
|
||||||
|
"swap_used_gb": 0.99,
|
||||||
|
"tmpfs": [
|
||||||
|
{
|
||||||
|
"mount": "/tmp",
|
||||||
|
"usage_percent": 15.0,
|
||||||
|
"used_gb": 0.3,
|
||||||
|
"total_gb": 2.0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"storage": {
|
||||||
|
"drives": [
|
||||||
|
{
|
||||||
|
"name": "nvme0n1",
|
||||||
|
"health": "PASSED",
|
||||||
|
"temperature_celsius": 29.0,
|
||||||
|
"wear_percent": 1.0,
|
||||||
|
"filesystems": [
|
||||||
|
{
|
||||||
|
"mount": "/",
|
||||||
|
"usage_percent": 24.0,
|
||||||
|
"used_gb": 224.9,
|
||||||
|
"total_gb": 928.2
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"pools": [
|
||||||
|
{
|
||||||
|
"name": "srv_media",
|
||||||
|
"mount": "/srv/media",
|
||||||
|
"type": "mergerfs",
|
||||||
|
"health": "healthy",
|
||||||
|
"usage_percent": 63.0,
|
||||||
|
"used_gb": 2355.2,
|
||||||
|
"total_gb": 3686.4,
|
||||||
|
"data_drives": [{ "name": "sdb", "temperature_celsius": 24.0 }],
|
||||||
|
"parity_drives": [{ "name": "sdc", "temperature_celsius": 24.0 }]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"services": [
|
||||||
|
{ "name": "sshd", "status": "active", "memory_mb": 4.5, "disk_gb": 0.0 }
|
||||||
|
],
|
||||||
|
"backup": {
|
||||||
|
"status": "completed",
|
||||||
|
"last_run": 1763920000,
|
||||||
|
"next_scheduled": 1764006400,
|
||||||
|
"total_size_gb": 150.5,
|
||||||
|
"repository_health": "ok"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
- ✅ Agent sends structured JSON over ZMQ (no legacy support)
|
||||||
|
- ✅ Type-safe data access: `data.system.storage.drives[0].temperature_celsius`
|
||||||
|
- ✅ Complete metric coverage: CPU, memory, storage, services, backup
|
||||||
|
- ✅ Backward compatibility via bridge conversion to existing UI widgets
|
||||||
|
- ✅ All string parsing bugs eliminated
|
||||||
|
|
||||||
### Maintenance Mode
|
### Maintenance Mode
|
||||||
|
|
||||||
- Agent checks for `/tmp/cm-maintenance` file before sending notifications
|
- Agent checks for `/tmp/cm-maintenance` file before sending notifications
|
||||||
- File presence suppresses all email notifications while continuing monitoring
|
- File presence suppresses all email notifications while continuing monitoring
|
||||||
- Dashboard continues to show real status, only notifications are blocked
|
- Dashboard continues to show real status, only notifications are blocked
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Enable maintenance mode
|
# Enable maintenance mode
|
||||||
touch /tmp/cm-maintenance
|
touch /tmp/cm-maintenance
|
||||||
@@ -87,16 +180,19 @@ rm /tmp/cm-maintenance
|
|||||||
## Development and Deployment Architecture
|
## Development and Deployment Architecture
|
||||||
|
|
||||||
### Development Path
|
### Development Path
|
||||||
|
|
||||||
- **Location:** `~/projects/cm-dashboard`
|
- **Location:** `~/projects/cm-dashboard`
|
||||||
- **Purpose:** Development workflow only - for committing new code
|
- **Purpose:** Development workflow only - for committing new code
|
||||||
- **Access:** Only for developers to commit changes
|
- **Access:** Only for developers to commit changes
|
||||||
|
|
||||||
### Deployment Path
|
### Deployment Path
|
||||||
|
|
||||||
- **Location:** `/var/lib/cm-dashboard/nixos-config`
|
- **Location:** `/var/lib/cm-dashboard/nixos-config`
|
||||||
- **Purpose:** Production deployment only - agent clones/pulls from git
|
- **Purpose:** Production deployment only - agent clones/pulls from git
|
||||||
- **Workflow:** git pull → `/var/lib/cm-dashboard/nixos-config` → nixos-rebuild
|
- **Workflow:** git pull → `/var/lib/cm-dashboard/nixos-config` → nixos-rebuild
|
||||||
|
|
||||||
### Git Flow
|
### Git Flow
|
||||||
|
|
||||||
```
|
```
|
||||||
Development: ~/projects/cm-dashboard → git commit → git push
|
Development: ~/projects/cm-dashboard → git commit → git push
|
||||||
Deployment: git pull → /var/lib/cm-dashboard/nixos-config → rebuild
|
Deployment: git pull → /var/lib/cm-dashboard/nixos-config → rebuild
|
||||||
@@ -107,6 +203,7 @@ Deployment: git pull → /var/lib/cm-dashboard/nixos-config → rebuild
|
|||||||
CM Dashboard uses automated binary releases instead of source builds.
|
CM Dashboard uses automated binary releases instead of source builds.
|
||||||
|
|
||||||
### Creating New Releases
|
### Creating New Releases
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd ~/projects/cm-dashboard
|
cd ~/projects/cm-dashboard
|
||||||
git tag v0.1.X
|
git tag v0.1.X
|
||||||
@@ -114,11 +211,13 @@ git push origin v0.1.X
|
|||||||
```
|
```
|
||||||
|
|
||||||
This automatically:
|
This automatically:
|
||||||
|
|
||||||
- Builds static binaries with `RUSTFLAGS="-C target-feature=+crt-static"`
|
- Builds static binaries with `RUSTFLAGS="-C target-feature=+crt-static"`
|
||||||
- Creates GitHub-style release with tarball
|
- Creates GitHub-style release with tarball
|
||||||
- Uploads binaries via Gitea API
|
- Uploads binaries via Gitea API
|
||||||
|
|
||||||
### NixOS Configuration Updates
|
### NixOS Configuration Updates
|
||||||
|
|
||||||
Edit `~/projects/nixosbox/hosts/services/cm-dashboard.nix`:
|
Edit `~/projects/nixosbox/hosts/services/cm-dashboard.nix`:
|
||||||
|
|
||||||
```nix
|
```nix
|
||||||
@@ -130,6 +229,7 @@ src = pkgs.fetchurl {
|
|||||||
```
|
```
|
||||||
|
|
||||||
### Get Release Hash
|
### Get Release Hash
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd ~/projects/nixosbox
|
cd ~/projects/nixosbox
|
||||||
nix-build --no-out-link -E 'with import <nixpkgs> {}; fetchurl {
|
nix-build --no-out-link -E 'with import <nixpkgs> {}; fetchurl {
|
||||||
@@ -141,6 +241,7 @@ nix-build --no-out-link -E 'with import <nixpkgs> {}; fetchurl {
|
|||||||
### Building
|
### Building
|
||||||
|
|
||||||
**Testing & Building:**
|
**Testing & Building:**
|
||||||
|
|
||||||
- **Workspace builds**: `nix-shell -p openssl pkg-config --run "cargo build --workspace"`
|
- **Workspace builds**: `nix-shell -p openssl pkg-config --run "cargo build --workspace"`
|
||||||
- **Clean compilation**: Remove `target/` between major changes
|
- **Clean compilation**: Remove `target/` between major changes
|
||||||
|
|
||||||
@@ -153,6 +254,7 @@ The dashboard uses automatic storage discovery to eliminate manual configuration
|
|||||||
### Discovery Process
|
### Discovery Process
|
||||||
|
|
||||||
**At Agent Startup:**
|
**At Agent Startup:**
|
||||||
|
|
||||||
1. Parse `/proc/mounts` to identify all mounted filesystems
|
1. Parse `/proc/mounts` to identify all mounted filesystems
|
||||||
2. Detect MergerFS pools by analyzing `fuse.mergerfs` mount sources
|
2. Detect MergerFS pools by analyzing `fuse.mergerfs` mount sources
|
||||||
3. Identify member disks and potential parity relationships via heuristics
|
3. Identify member disks and potential parity relationships via heuristics
|
||||||
@@ -160,6 +262,7 @@ The dashboard uses automatic storage discovery to eliminate manual configuration
|
|||||||
5. Generate pool-aware metrics with hierarchical relationships
|
5. Generate pool-aware metrics with hierarchical relationships
|
||||||
|
|
||||||
**Continuous Monitoring:**
|
**Continuous Monitoring:**
|
||||||
|
|
||||||
- Use stored discovery data for efficient metric collection
|
- Use stored discovery data for efficient metric collection
|
||||||
- Monitor individual drives for SMART data, temperature, wear
|
- Monitor individual drives for SMART data, temperature, wear
|
||||||
- Calculate pool-level health based on member drive status
|
- Calculate pool-level health based on member drive status
|
||||||
@@ -168,11 +271,13 @@ The dashboard uses automatic storage discovery to eliminate manual configuration
|
|||||||
### Supported Storage Types
|
### Supported Storage Types
|
||||||
|
|
||||||
**Single Disks:**
|
**Single Disks:**
|
||||||
|
|
||||||
- ext4, xfs, btrfs mounted directly
|
- ext4, xfs, btrfs mounted directly
|
||||||
- Individual drive monitoring with SMART data
|
- Individual drive monitoring with SMART data
|
||||||
- Traditional single-disk display for root, boot, etc.
|
- Traditional single-disk display for root, boot, etc.
|
||||||
|
|
||||||
**MergerFS Pools:**
|
**MergerFS Pools:**
|
||||||
|
|
||||||
- Auto-detect from `/proc/mounts` fuse.mergerfs entries
|
- Auto-detect from `/proc/mounts` fuse.mergerfs entries
|
||||||
- Parse source paths to identify member disks (e.g., "/mnt/disk1:/mnt/disk2")
|
- Parse source paths to identify member disks (e.g., "/mnt/disk1:/mnt/disk2")
|
||||||
- Heuristic parity disk detection (sequential device names, "parity" in path)
|
- Heuristic parity disk detection (sequential device names, "parity" in path)
|
||||||
@@ -180,6 +285,7 @@ The dashboard uses automatic storage discovery to eliminate manual configuration
|
|||||||
- Hierarchical tree display with data/parity disk grouping
|
- Hierarchical tree display with data/parity disk grouping
|
||||||
|
|
||||||
**Future Extensions Ready:**
|
**Future Extensions Ready:**
|
||||||
|
|
||||||
- RAID arrays via `/proc/mdstat` parsing
|
- RAID arrays via `/proc/mdstat` parsing
|
||||||
- ZFS pools via `zpool status` integration
|
- ZFS pools via `zpool status` integration
|
||||||
- LVM logical volumes via `lvs` discovery
|
- LVM logical volumes via `lvs` discovery
|
||||||
@@ -198,76 +304,35 @@ exclude_fs_types = ["tmpfs", "devtmpfs", "sysfs", "proc"]
|
|||||||
### Display Format
|
### Display Format
|
||||||
|
|
||||||
```
|
```
|
||||||
|
Network:
|
||||||
|
● eno1:
|
||||||
|
├─ ip: 192.168.30.105
|
||||||
|
└─ tailscale0: 100.125.108.16
|
||||||
|
● eno2:
|
||||||
|
└─ ip: 192.168.32.105
|
||||||
|
CPU:
|
||||||
|
● Load: 0.23 0.21 0.13
|
||||||
|
└─ Freq: 1048 MHz
|
||||||
|
RAM:
|
||||||
|
● Usage: 25% 5.8GB/23.3GB
|
||||||
|
├─ ● /tmp: 2% 0.5GB/2GB
|
||||||
|
└─ ● /var/tmp: 0% 0GB/1.0GB
|
||||||
Storage:
|
Storage:
|
||||||
● /srv/media (mergerfs (2+1)):
|
● 844B9A25 T: 25C W: 4%
|
||||||
├─ Pool Status: ● Healthy (3 drives)
|
├─ ● /: 55% 250.5GB/456.4GB
|
||||||
├─ Total: ● 63% 2355.2GB/3686.4GB
|
└─ ● /boot: 26% 0.3GB/1.0GB
|
||||||
├─ Data Disks:
|
● mergerfs /srv/media:
|
||||||
│ ├─ ● sdb T: 24°C
|
├─ ● 63% 2355.2GB/3686.4GB
|
||||||
│ └─ ● sdd T: 27°C
|
├─ ● Data_1: WDZQ8H8D T: 28°C
|
||||||
└─ Parity: ● sdc T: 24°C
|
├─ ● Data_2: GGA04461 T: 28°C
|
||||||
● /:
|
└─ ● Parity: WDZS8RY0 T: 29°C
|
||||||
├─ ● nvme0n1 W: 13%
|
Backup:
|
||||||
└─ ● 7% 14.5GB/218.5GB
|
● WD-WCC7K1234567 T: 32°C W: 12%
|
||||||
|
├─ Last: 2h ago (12.3GB)
|
||||||
|
├─ Next: in 22h
|
||||||
|
└─ ● Usage: 45% 678GB/1.5TB
|
||||||
```
|
```
|
||||||
|
|
||||||
### Implementation Benefits
|
|
||||||
|
|
||||||
- **Zero Configuration**: No manual pool definitions required
|
|
||||||
- **Always Accurate**: Reflects actual system state automatically
|
|
||||||
- **Scales Automatically**: Handles any number of pools without config changes
|
|
||||||
- **Backwards Compatible**: Single disks continue working unchanged
|
|
||||||
- **Future Ready**: Easy extension for additional storage technologies
|
|
||||||
|
|
||||||
### Current Status (v0.1.100)
|
|
||||||
|
|
||||||
**✅ Completed:**
|
|
||||||
- Auto-discovery system implemented and deployed
|
|
||||||
- `/proc/mounts` parsing with smart heuristics for parity detection
|
|
||||||
- Storage topology stored at agent startup for efficient monitoring
|
|
||||||
- Universal zero-configuration for all hosts (cmbox, steambox, simonbox, srv01, srv02, srv03)
|
|
||||||
- Enhanced pool health calculation (healthy/degraded/critical)
|
|
||||||
- Hierarchical tree visualization with data/parity disk separation
|
|
||||||
|
|
||||||
**🔄 In Progress - Complete Disk Collector Rewrite:**
|
|
||||||
|
|
||||||
The current disk collector has grown complex with mixed legacy/auto-discovery approaches. Planning complete rewrite with clean, simple workflow supporting both physical drives and mergerfs pools.
|
|
||||||
|
|
||||||
**New Clean Architecture:**
|
|
||||||
|
|
||||||
**Discovery Workflow:**
|
|
||||||
1. **`lsblk`** to detect all mount points and backing devices
|
|
||||||
2. **`df`** to get filesystem usage for each mount point
|
|
||||||
3. **Group by physical drive** (nvme0n1, sda, etc.)
|
|
||||||
4. **Parse `/proc/mounts`** for mergerfs pools
|
|
||||||
5. **Generate unified metrics** for both storage types
|
|
||||||
|
|
||||||
**Physical Drive Display:**
|
|
||||||
```
|
|
||||||
● nvme0n1:
|
|
||||||
├─ ● Drive: T: 35°C W: 1%
|
|
||||||
├─ ● Total: 23% 218.0GB/928.2GB
|
|
||||||
├─ ● /boot: 11% 0.1GB/1.0GB
|
|
||||||
└─ ● /: 23% 214.9GB/928.2GB
|
|
||||||
```
|
|
||||||
|
|
||||||
**MergerFS Pool Display:**
|
|
||||||
```
|
|
||||||
● /srv/media (mergerfs):
|
|
||||||
├─ ● Pool: 63% 2355.2GB/3686.4GB
|
|
||||||
├─ Data Disks:
|
|
||||||
│ ├─ ● sdb T: 24°C
|
|
||||||
│ └─ ● sdd T: 27°C
|
|
||||||
└─ ● sdc T: 24°C (parity)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Implementation Benefits:**
|
|
||||||
- **Pure auto-discovery**: No configuration needed
|
|
||||||
- **Clean code paths**: Single workflow for all storage types
|
|
||||||
- **Consistent display**: Status icons on every line, no redundant text
|
|
||||||
- **Simple pipeline**: lsblk → df → group → metrics
|
|
||||||
- **Support for both**: Physical drives and mergerfs pools
|
|
||||||
|
|
||||||
## Important Communication Guidelines
|
## Important Communication Guidelines
|
||||||
|
|
||||||
Keep responses concise and focused. Avoid extensive implementation summaries unless requested.
|
Keep responses concise and focused. Avoid extensive implementation summaries unless requested.
|
||||||
@@ -275,17 +340,20 @@ Keep responses concise and focused. Avoid extensive implementation summaries unl
|
|||||||
## Commit Message Guidelines
|
## Commit Message Guidelines
|
||||||
|
|
||||||
**NEVER mention:**
|
**NEVER mention:**
|
||||||
|
|
||||||
- Claude or any AI assistant names
|
- Claude or any AI assistant names
|
||||||
- Automation or AI-generated content
|
- Automation or AI-generated content
|
||||||
- Any reference to automated code generation
|
- Any reference to automated code generation
|
||||||
|
|
||||||
**ALWAYS:**
|
**ALWAYS:**
|
||||||
|
|
||||||
- Focus purely on technical changes and their purpose
|
- Focus purely on technical changes and their purpose
|
||||||
- Use standard software development commit message format
|
- Use standard software development commit message format
|
||||||
- Describe what was changed and why, not how it was created
|
- Describe what was changed and why, not how it was created
|
||||||
- Write from the perspective of a human developer
|
- Write from the perspective of a human developer
|
||||||
|
|
||||||
**Examples:**
|
**Examples:**
|
||||||
|
|
||||||
- ❌ "Generated with Claude Code"
|
- ❌ "Generated with Claude Code"
|
||||||
- ❌ "AI-assisted implementation"
|
- ❌ "AI-assisted implementation"
|
||||||
- ❌ "Automated refactoring"
|
- ❌ "Automated refactoring"
|
||||||
@@ -295,12 +363,12 @@ Keep responses concise and focused. Avoid extensive implementation summaries unl
|
|||||||
|
|
||||||
## Implementation Rules
|
## Implementation Rules
|
||||||
|
|
||||||
1. **Individual Metrics**: Each metric is collected, transmitted, and stored individually
|
1. **Agent Status Authority**: Agent calculates status for each metric using thresholds
|
||||||
2. **Agent Status Authority**: Agent calculates status for each metric using thresholds
|
2. **Dashboard Composition**: Dashboard widgets subscribe to specific metrics by name
|
||||||
3. **Dashboard Composition**: Dashboard widgets subscribe to specific metrics by name
|
3. **Status Aggregation**: Dashboard aggregates individual metric statuses for widget status
|
||||||
4. **Status Aggregation**: Dashboard aggregates individual metric statuses for widget status
|
|
||||||
|
|
||||||
**NEVER:**
|
**NEVER:**
|
||||||
|
|
||||||
- Copy/paste ANY code from legacy implementations
|
- Copy/paste ANY code from legacy implementations
|
||||||
- Calculate status in dashboard widgets
|
- Calculate status in dashboard widgets
|
||||||
- Hardcode metric names in widgets (use const arrays)
|
- Hardcode metric names in widgets (use const arrays)
|
||||||
@@ -308,6 +376,7 @@ Keep responses concise and focused. Avoid extensive implementation summaries unl
|
|||||||
- Create documentation files unless explicitly requested
|
- Create documentation files unless explicitly requested
|
||||||
|
|
||||||
**ALWAYS:**
|
**ALWAYS:**
|
||||||
|
|
||||||
- Prefer editing existing files to creating new ones
|
- Prefer editing existing files to creating new ones
|
||||||
- Follow existing code conventions and patterns
|
- Follow existing code conventions and patterns
|
||||||
- Use existing libraries and utilities
|
- Use existing libraries and utilities
|
||||||
|
|||||||
6
Cargo.lock
generated
6
Cargo.lock
generated
@@ -279,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cm-dashboard"
|
name = "cm-dashboard"
|
||||||
version = "0.1.118"
|
version = "0.1.180"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"chrono",
|
"chrono",
|
||||||
@@ -301,7 +301,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cm-dashboard-agent"
|
name = "cm-dashboard-agent"
|
||||||
version = "0.1.118"
|
version = "0.1.180"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
@@ -324,7 +324,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cm-dashboard-shared"
|
name = "cm-dashboard-shared"
|
||||||
version = "0.1.118"
|
version = "0.1.180"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"chrono",
|
"chrono",
|
||||||
"serde",
|
"serde",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "cm-dashboard-agent"
|
name = "cm-dashboard-agent"
|
||||||
version = "0.1.118"
|
version = "0.1.181"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
|||||||
@@ -6,20 +6,35 @@ use tracing::{debug, error, info};
|
|||||||
|
|
||||||
use crate::communication::{AgentCommand, ZmqHandler};
|
use crate::communication::{AgentCommand, ZmqHandler};
|
||||||
use crate::config::AgentConfig;
|
use crate::config::AgentConfig;
|
||||||
use crate::metrics::MetricCollectionManager;
|
use crate::collectors::{
|
||||||
|
Collector,
|
||||||
|
backup::BackupCollector,
|
||||||
|
cpu::CpuCollector,
|
||||||
|
disk::DiskCollector,
|
||||||
|
memory::MemoryCollector,
|
||||||
|
network::NetworkCollector,
|
||||||
|
nixos::NixOSCollector,
|
||||||
|
systemd::SystemdCollector,
|
||||||
|
};
|
||||||
use crate::notifications::NotificationManager;
|
use crate::notifications::NotificationManager;
|
||||||
use crate::service_tracker::UserStoppedServiceTracker;
|
use cm_dashboard_shared::AgentData;
|
||||||
use crate::status::HostStatusManager;
|
|
||||||
use cm_dashboard_shared::{Metric, MetricMessage, MetricValue, Status};
|
|
||||||
|
|
||||||
pub struct Agent {
|
pub struct Agent {
|
||||||
hostname: String,
|
hostname: String,
|
||||||
config: AgentConfig,
|
config: AgentConfig,
|
||||||
zmq_handler: ZmqHandler,
|
zmq_handler: ZmqHandler,
|
||||||
metric_manager: MetricCollectionManager,
|
collectors: Vec<Box<dyn Collector>>,
|
||||||
notification_manager: NotificationManager,
|
notification_manager: NotificationManager,
|
||||||
host_status_manager: HostStatusManager,
|
previous_status: Option<SystemStatus>,
|
||||||
service_tracker: UserStoppedServiceTracker,
|
}
|
||||||
|
|
||||||
|
/// Track system component status for change detection
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct SystemStatus {
|
||||||
|
cpu_load_status: cm_dashboard_shared::Status,
|
||||||
|
cpu_temperature_status: cm_dashboard_shared::Status,
|
||||||
|
memory_usage_status: cm_dashboard_shared::Status,
|
||||||
|
// Add more as needed
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Agent {
|
impl Agent {
|
||||||
@@ -40,76 +55,84 @@ impl Agent {
|
|||||||
config.zmq.publisher_port
|
config.zmq.publisher_port
|
||||||
);
|
);
|
||||||
|
|
||||||
// Initialize metric collection manager with cache config
|
// Initialize collectors
|
||||||
let metric_manager = MetricCollectionManager::new(&config.collectors, &config).await?;
|
let mut collectors: Vec<Box<dyn Collector>> = Vec::new();
|
||||||
info!("Metric collection manager initialized");
|
|
||||||
|
// Add enabled collectors
|
||||||
|
if config.collectors.cpu.enabled {
|
||||||
|
collectors.push(Box::new(CpuCollector::new(config.collectors.cpu.clone())));
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.collectors.memory.enabled {
|
||||||
|
collectors.push(Box::new(MemoryCollector::new(config.collectors.memory.clone())));
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.collectors.disk.enabled {
|
||||||
|
collectors.push(Box::new(DiskCollector::new(config.collectors.disk.clone())));
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.collectors.systemd.enabled {
|
||||||
|
collectors.push(Box::new(SystemdCollector::new(config.collectors.systemd.clone())));
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.collectors.backup.enabled {
|
||||||
|
collectors.push(Box::new(BackupCollector::new()));
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.collectors.network.enabled {
|
||||||
|
collectors.push(Box::new(NetworkCollector::new(config.collectors.network.clone())));
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.collectors.nixos.enabled {
|
||||||
|
collectors.push(Box::new(NixOSCollector::new(config.collectors.nixos.clone())));
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Initialized {} collectors", collectors.len());
|
||||||
|
|
||||||
// Initialize notification manager
|
// Initialize notification manager
|
||||||
let notification_manager = NotificationManager::new(&config.notifications, &hostname)?;
|
let notification_manager = NotificationManager::new(&config.notifications, &hostname)?;
|
||||||
info!("Notification manager initialized");
|
info!("Notification manager initialized");
|
||||||
|
|
||||||
// Initialize host status manager
|
|
||||||
let host_status_manager = HostStatusManager::new(config.status_aggregation.clone());
|
|
||||||
info!("Host status manager initialized");
|
|
||||||
|
|
||||||
// Initialize user-stopped service tracker
|
|
||||||
let service_tracker = UserStoppedServiceTracker::init_global()?;
|
|
||||||
info!("User-stopped service tracker initialized");
|
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
hostname,
|
hostname,
|
||||||
config,
|
config,
|
||||||
zmq_handler,
|
zmq_handler,
|
||||||
metric_manager,
|
collectors,
|
||||||
notification_manager,
|
notification_manager,
|
||||||
host_status_manager,
|
previous_status: None,
|
||||||
service_tracker,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Main agent loop with structured data collection
|
||||||
pub async fn run(&mut self, mut shutdown_rx: tokio::sync::oneshot::Receiver<()>) -> Result<()> {
|
pub async fn run(&mut self, mut shutdown_rx: tokio::sync::oneshot::Receiver<()>) -> Result<()> {
|
||||||
info!("Starting agent main loop with separated collection and transmission");
|
info!("Starting agent main loop");
|
||||||
|
|
||||||
// CRITICAL: Collect ALL data immediately at startup before entering the loop
|
// Initial collection
|
||||||
info!("Performing initial FORCE collection of all metrics at startup");
|
if let Err(e) = self.collect_and_broadcast().await {
|
||||||
if let Err(e) = self.collect_all_metrics_force().await {
|
error!("Initial metric collection failed: {}", e);
|
||||||
error!("Failed to collect initial metrics: {}", e);
|
|
||||||
} else {
|
|
||||||
info!("Initial metric collection completed - all data cached and ready");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Separate intervals for collection, transmission, heartbeat, and email notifications
|
// Set up intervals
|
||||||
let mut collection_interval =
|
let mut transmission_interval = interval(Duration::from_secs(
|
||||||
interval(Duration::from_secs(self.config.collection_interval_seconds));
|
self.config.collection_interval_seconds,
|
||||||
let mut transmission_interval = interval(Duration::from_secs(self.config.zmq.transmission_interval_seconds));
|
));
|
||||||
let mut heartbeat_interval = interval(Duration::from_secs(self.config.zmq.heartbeat_interval_seconds));
|
let mut notification_interval = interval(Duration::from_secs(30)); // Check notifications every 30s
|
||||||
let mut notification_interval = interval(Duration::from_secs(self.config.notifications.aggregation_interval_seconds));
|
|
||||||
|
// Skip initial ticks to avoid immediate execution
|
||||||
|
transmission_interval.tick().await;
|
||||||
|
notification_interval.tick().await;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = collection_interval.tick() => {
|
|
||||||
// Only collect and cache metrics, no ZMQ transmission
|
|
||||||
if let Err(e) = self.collect_metrics_only().await {
|
|
||||||
error!("Failed to collect metrics: {}", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ = transmission_interval.tick() => {
|
_ = transmission_interval.tick() => {
|
||||||
// Send all metrics via ZMQ (dashboard updates only)
|
if let Err(e) = self.collect_and_broadcast().await {
|
||||||
if let Err(e) = self.broadcast_all_metrics().await {
|
error!("Failed to collect and broadcast metrics: {}", e);
|
||||||
error!("Failed to broadcast metrics: {}", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ = heartbeat_interval.tick() => {
|
|
||||||
// Send standalone heartbeat for host connectivity detection
|
|
||||||
if let Err(e) = self.send_heartbeat().await {
|
|
||||||
error!("Failed to send heartbeat: {}", e);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_ = notification_interval.tick() => {
|
_ = notification_interval.tick() => {
|
||||||
// Process batched email notifications (separate from dashboard updates)
|
// Process any pending notifications
|
||||||
if let Err(e) = self.host_status_manager.process_pending_notifications(&mut self.notification_manager).await {
|
// NOTE: With structured data, we might need to implement status tracking differently
|
||||||
error!("Failed to process pending notifications: {}", e);
|
// For now, we skip this until status evaluation is migrated
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// Handle incoming commands (check periodically)
|
// Handle incoming commands (check periodically)
|
||||||
_ = tokio::time::sleep(Duration::from_millis(100)) => {
|
_ = tokio::time::sleep(Duration::from_millis(100)) => {
|
||||||
@@ -128,225 +151,144 @@ impl Agent {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn collect_all_metrics_force(&mut self) -> Result<()> {
|
/// Collect structured data from all collectors and broadcast via ZMQ
|
||||||
info!("Starting FORCE metric collection for startup");
|
async fn collect_and_broadcast(&mut self) -> Result<()> {
|
||||||
|
debug!("Starting structured data collection");
|
||||||
|
|
||||||
// Force collect all metrics from all collectors immediately
|
// Initialize empty AgentData
|
||||||
let metrics = self.metric_manager.collect_all_metrics_force().await?;
|
let mut agent_data = AgentData::new(self.hostname.clone(), env!("CARGO_PKG_VERSION").to_string());
|
||||||
|
|
||||||
if metrics.is_empty() {
|
// Collect data from all collectors
|
||||||
error!("No metrics collected during force collection!");
|
for collector in &self.collectors {
|
||||||
return Ok(());
|
if let Err(e) = collector.collect_structured(&mut agent_data).await {
|
||||||
|
error!("Collector failed: {}", e);
|
||||||
|
// Continue with other collectors even if one fails
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("Force collected and cached {} metrics", metrics.len());
|
// Check for status changes and send notifications
|
||||||
|
if let Err(e) = self.check_status_changes_and_notify(&agent_data).await {
|
||||||
|
error!("Failed to check status changes: {}", e);
|
||||||
|
}
|
||||||
|
|
||||||
// Process metrics through status manager (collect status data at startup)
|
// Broadcast the structured data via ZMQ
|
||||||
let _status_changed = self.process_metrics(&metrics).await;
|
if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data).await {
|
||||||
|
error!("Failed to broadcast agent data: {}", e);
|
||||||
|
} else {
|
||||||
|
debug!("Successfully broadcast structured agent data");
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn collect_metrics_only(&mut self) -> Result<()> {
|
/// Check for status changes and send notifications
|
||||||
debug!("Starting metric collection cycle (cache only)");
|
async fn check_status_changes_and_notify(&mut self, agent_data: &AgentData) -> Result<()> {
|
||||||
|
// Extract current status
|
||||||
|
let current_status = SystemStatus {
|
||||||
|
cpu_load_status: agent_data.system.cpu.load_status.clone(),
|
||||||
|
cpu_temperature_status: agent_data.system.cpu.temperature_status.clone(),
|
||||||
|
memory_usage_status: agent_data.system.memory.usage_status.clone(),
|
||||||
|
};
|
||||||
|
|
||||||
// Collect all metrics from all collectors and cache them
|
// Check for status changes
|
||||||
let metrics = self.metric_manager.collect_all_metrics().await?;
|
if let Some(previous) = self.previous_status.clone() {
|
||||||
|
self.check_and_notify_status_change(
|
||||||
|
"CPU Load",
|
||||||
|
&previous.cpu_load_status,
|
||||||
|
&current_status.cpu_load_status,
|
||||||
|
format!("CPU load: {:.1}", agent_data.system.cpu.load_1min)
|
||||||
|
).await?;
|
||||||
|
|
||||||
if metrics.is_empty() {
|
self.check_and_notify_status_change(
|
||||||
debug!("No metrics collected this cycle");
|
"CPU Temperature",
|
||||||
return Ok(());
|
&previous.cpu_temperature_status,
|
||||||
|
&current_status.cpu_temperature_status,
|
||||||
|
format!("CPU temperature: {}°C",
|
||||||
|
agent_data.system.cpu.temperature_celsius.unwrap_or(0.0) as i32)
|
||||||
|
).await?;
|
||||||
|
|
||||||
|
self.check_and_notify_status_change(
|
||||||
|
"Memory Usage",
|
||||||
|
&previous.memory_usage_status,
|
||||||
|
&current_status.memory_usage_status,
|
||||||
|
format!("Memory usage: {:.1}%", agent_data.system.memory.usage_percent)
|
||||||
|
).await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("Collected and cached {} metrics", metrics.len());
|
// Store current status for next comparison
|
||||||
|
self.previous_status = Some(current_status);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
// Process metrics through status manager and trigger immediate transmission if status changed
|
/// Check individual status change and send notification if degraded
|
||||||
let status_changed = self.process_metrics(&metrics).await;
|
async fn check_and_notify_status_change(
|
||||||
|
&mut self,
|
||||||
|
component: &str,
|
||||||
|
previous: &cm_dashboard_shared::Status,
|
||||||
|
current: &cm_dashboard_shared::Status,
|
||||||
|
details: String
|
||||||
|
) -> Result<()> {
|
||||||
|
use cm_dashboard_shared::Status;
|
||||||
|
|
||||||
if status_changed {
|
// Only notify on status degradation (OK → Warning/Critical, Warning → Critical)
|
||||||
info!("Status change detected - triggering immediate metric transmission");
|
let should_notify = match (previous, current) {
|
||||||
if let Err(e) = self.broadcast_all_metrics().await {
|
(Status::Ok, Status::Warning) => true,
|
||||||
error!("Failed to broadcast metrics after status change: {}", e);
|
(Status::Ok, Status::Critical) => true,
|
||||||
|
(Status::Warning, Status::Critical) => true,
|
||||||
|
_ => false,
|
||||||
|
};
|
||||||
|
|
||||||
|
if should_notify {
|
||||||
|
let subject = format!("{} {} Alert", self.hostname, component);
|
||||||
|
let body = format!(
|
||||||
|
"Alert: {} status changed from {:?} to {:?}\n\nDetails: {}\n\nTime: {}",
|
||||||
|
component,
|
||||||
|
previous,
|
||||||
|
current,
|
||||||
|
details,
|
||||||
|
chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC")
|
||||||
|
);
|
||||||
|
|
||||||
|
info!("Sending notification: {} - {:?} → {:?}", component, previous, current);
|
||||||
|
|
||||||
|
if let Err(e) = self.notification_manager.send_direct_email(&subject, &body).await {
|
||||||
|
error!("Failed to send notification for {}: {}", component, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn broadcast_all_metrics(&mut self) -> Result<()> {
|
/// Handle incoming commands from dashboard
|
||||||
debug!("Broadcasting cached metrics via ZMQ");
|
|
||||||
|
|
||||||
// Get cached metrics (no fresh collection)
|
|
||||||
let mut metrics = self.metric_manager.get_cached_metrics();
|
|
||||||
|
|
||||||
// Add the host status summary metric from status manager
|
|
||||||
let host_status_metric = self.host_status_manager.get_host_status_metric();
|
|
||||||
metrics.push(host_status_metric);
|
|
||||||
|
|
||||||
// Add agent version metric for cross-host version comparison
|
|
||||||
let version_metric = self.get_agent_version_metric();
|
|
||||||
metrics.push(version_metric);
|
|
||||||
|
|
||||||
// Add heartbeat metric for host connectivity detection
|
|
||||||
let heartbeat_metric = self.get_heartbeat_metric();
|
|
||||||
metrics.push(heartbeat_metric);
|
|
||||||
|
|
||||||
// Check for user-stopped services that are now active and clear their flags
|
|
||||||
self.clear_user_stopped_flags_for_active_services(&metrics);
|
|
||||||
|
|
||||||
if metrics.is_empty() {
|
|
||||||
debug!("No metrics to broadcast");
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!("Broadcasting {} cached metrics (including host status summary)", metrics.len());
|
|
||||||
|
|
||||||
// Create and send message with all current data
|
|
||||||
let message = MetricMessage::new(self.hostname.clone(), metrics);
|
|
||||||
self.zmq_handler.publish_metrics(&message).await?;
|
|
||||||
|
|
||||||
debug!("Metrics broadcasted successfully");
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn process_metrics(&mut self, metrics: &[Metric]) -> bool {
|
|
||||||
let mut status_changed = false;
|
|
||||||
for metric in metrics {
|
|
||||||
// Filter excluded metrics from email notification processing only
|
|
||||||
if self.config.notifications.exclude_email_metrics.contains(&metric.name) {
|
|
||||||
debug!("Excluding metric '{}' from email notification processing", metric.name);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if self.host_status_manager.process_metric(metric, &mut self.notification_manager).await {
|
|
||||||
status_changed = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
status_changed
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create agent version metric for cross-host version comparison
|
|
||||||
fn get_agent_version_metric(&self) -> Metric {
|
|
||||||
// Get version from executable path (same logic as main.rs get_version)
|
|
||||||
let version = self.get_agent_version();
|
|
||||||
|
|
||||||
Metric::new(
|
|
||||||
"agent_version".to_string(),
|
|
||||||
MetricValue::String(version),
|
|
||||||
Status::Ok,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get agent version from Cargo package version
|
|
||||||
fn get_agent_version(&self) -> String {
|
|
||||||
// Use the version from Cargo.toml (e.g., "0.1.11")
|
|
||||||
format!("v{}", env!("CARGO_PKG_VERSION"))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create heartbeat metric for host connectivity detection
|
|
||||||
fn get_heartbeat_metric(&self) -> Metric {
|
|
||||||
use std::time::{SystemTime, UNIX_EPOCH};
|
|
||||||
|
|
||||||
let timestamp = SystemTime::now()
|
|
||||||
.duration_since(UNIX_EPOCH)
|
|
||||||
.unwrap()
|
|
||||||
.as_secs();
|
|
||||||
|
|
||||||
Metric::new(
|
|
||||||
"agent_heartbeat".to_string(),
|
|
||||||
MetricValue::Integer(timestamp as i64),
|
|
||||||
Status::Ok,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Send standalone heartbeat for connectivity detection
|
|
||||||
async fn send_heartbeat(&mut self) -> Result<()> {
|
|
||||||
let heartbeat_metric = self.get_heartbeat_metric();
|
|
||||||
let message = MetricMessage::new(
|
|
||||||
self.hostname.clone(),
|
|
||||||
vec![heartbeat_metric],
|
|
||||||
);
|
|
||||||
|
|
||||||
self.zmq_handler.publish_metrics(&message).await?;
|
|
||||||
debug!("Sent standalone heartbeat for connectivity detection");
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn handle_commands(&mut self) -> Result<()> {
|
async fn handle_commands(&mut self) -> Result<()> {
|
||||||
// Try to receive commands (non-blocking)
|
// Try to receive a command (non-blocking)
|
||||||
match self.zmq_handler.try_receive_command() {
|
if let Ok(Some(command)) = self.zmq_handler.try_receive_command() {
|
||||||
Ok(Some(command)) => {
|
info!("Received command: {:?}", command);
|
||||||
info!("Received command: {:?}", command);
|
|
||||||
self.process_command(command).await?;
|
|
||||||
}
|
|
||||||
Ok(None) => {
|
|
||||||
// No command available - this is normal
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
error!("Error receiving command: {}", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn process_command(&mut self, command: AgentCommand) -> Result<()> {
|
match command {
|
||||||
match command {
|
AgentCommand::CollectNow => {
|
||||||
AgentCommand::CollectNow => {
|
info!("Received immediate collection request");
|
||||||
info!("Processing CollectNow command");
|
if let Err(e) = self.collect_and_broadcast().await {
|
||||||
if let Err(e) = self.collect_metrics_only().await {
|
error!("Failed to collect on demand: {}", e);
|
||||||
error!("Failed to collect metrics on command: {}", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
AgentCommand::SetInterval { seconds } => {
|
|
||||||
info!("Processing SetInterval command: {} seconds", seconds);
|
|
||||||
// Note: This would require modifying the interval, which is complex
|
|
||||||
// For now, just log the request
|
|
||||||
info!("Interval change requested but not implemented yet");
|
|
||||||
}
|
|
||||||
AgentCommand::ToggleCollector { name, enabled } => {
|
|
||||||
info!(
|
|
||||||
"Processing ToggleCollector command: {} -> {}",
|
|
||||||
name, enabled
|
|
||||||
);
|
|
||||||
// Note: This would require dynamic collector management
|
|
||||||
info!("Collector toggle requested but not implemented yet");
|
|
||||||
}
|
|
||||||
AgentCommand::Ping => {
|
|
||||||
info!("Processing Ping command - agent is alive");
|
|
||||||
// Could send a response back via ZMQ if needed
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Check metrics for user-stopped services that are now active and clear their flags
|
|
||||||
fn clear_user_stopped_flags_for_active_services(&mut self, metrics: &[Metric]) {
|
|
||||||
for metric in metrics {
|
|
||||||
// Look for service status metrics that are active
|
|
||||||
if metric.name.starts_with("service_") && metric.name.ends_with("_status") {
|
|
||||||
if let MetricValue::String(status) = &metric.value {
|
|
||||||
if status == "active" {
|
|
||||||
// Extract service name from metric name (service_nginx_status -> nginx)
|
|
||||||
let service_name = metric.name
|
|
||||||
.strip_prefix("service_")
|
|
||||||
.and_then(|s| s.strip_suffix("_status"))
|
|
||||||
.unwrap_or("");
|
|
||||||
|
|
||||||
if !service_name.is_empty() && UserStoppedServiceTracker::is_service_user_stopped(service_name) {
|
|
||||||
info!("Service '{}' is now active - clearing user-stopped flag", service_name);
|
|
||||||
if let Err(e) = self.service_tracker.clear_user_stopped(service_name) {
|
|
||||||
error!("Failed to clear user-stopped flag for '{}': {}", service_name, e);
|
|
||||||
} else {
|
|
||||||
// Sync to global tracker
|
|
||||||
UserStoppedServiceTracker::update_global(&self.service_tracker);
|
|
||||||
debug!("Cleared user-stopped flag for service '{}'", service_name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
AgentCommand::SetInterval { seconds } => {
|
||||||
|
info!("Received interval change request: {}s", seconds);
|
||||||
|
// Note: This would require more complex handling to update the interval
|
||||||
|
// For now, just acknowledge
|
||||||
|
}
|
||||||
|
AgentCommand::ToggleCollector { name, enabled } => {
|
||||||
|
info!("Received collector toggle request: {} -> {}", name, enabled);
|
||||||
|
// Note: This would require more complex handling to enable/disable collectors
|
||||||
|
// For now, just acknowledge
|
||||||
|
}
|
||||||
|
AgentCommand::Ping => {
|
||||||
|
info!("Received ping command");
|
||||||
|
// Maybe send back a pong or status
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -1,448 +1,120 @@
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use chrono::Utc;
|
use cm_dashboard_shared::{AgentData, BackupData, BackupDiskData};
|
||||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use tokio::fs;
|
use std::fs;
|
||||||
|
use std::path::Path;
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
use super::{Collector, CollectorError};
|
use super::{Collector, CollectorError};
|
||||||
use tracing::error;
|
|
||||||
|
|
||||||
/// Backup collector that reads TOML status files for borgbackup metrics
|
/// Backup collector that reads backup status from TOML files with structured data output
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct BackupCollector {
|
pub struct BackupCollector {
|
||||||
pub backup_status_file: String,
|
/// Path to backup status file
|
||||||
pub max_age_hours: u64,
|
status_file_path: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BackupCollector {
|
impl BackupCollector {
|
||||||
pub fn new(backup_status_file: Option<String>, max_age_hours: u64) -> Self {
|
pub fn new() -> Self {
|
||||||
Self {
|
Self {
|
||||||
backup_status_file: backup_status_file
|
status_file_path: "/var/lib/backup/backup-status.toml".to_string(),
|
||||||
.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
|
|
||||||
max_age_hours,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Read backup status from TOML file
|
||||||
async fn read_backup_status(&self) -> Result<Option<BackupStatusToml>, CollectorError> {
|
async fn read_backup_status(&self) -> Result<Option<BackupStatusToml>, CollectorError> {
|
||||||
// Check if we're in maintenance mode
|
if !Path::new(&self.status_file_path).exists() {
|
||||||
if std::fs::metadata("/tmp/cm-maintenance").is_ok() {
|
debug!("Backup status file not found: {}", self.status_file_path);
|
||||||
// Return special maintenance mode status
|
return Ok(None);
|
||||||
let maintenance_status = BackupStatusToml {
|
|
||||||
backup_name: "maintenance".to_string(),
|
|
||||||
start_time: chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC").to_string(),
|
|
||||||
current_time: chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC").to_string(),
|
|
||||||
duration_seconds: 0,
|
|
||||||
status: "pending".to_string(),
|
|
||||||
last_updated: chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC").to_string(),
|
|
||||||
disk_space: None,
|
|
||||||
disk_product_name: None,
|
|
||||||
disk_serial_number: None,
|
|
||||||
disk_wear_percent: None,
|
|
||||||
services: HashMap::new(),
|
|
||||||
};
|
|
||||||
return Ok(Some(maintenance_status));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if backup status file exists
|
let content = fs::read_to_string(&self.status_file_path)
|
||||||
if !std::path::Path::new(&self.backup_status_file).exists() {
|
|
||||||
return Ok(None); // File doesn't exist, but this is not an error
|
|
||||||
}
|
|
||||||
|
|
||||||
let content = fs::read_to_string(&self.backup_status_file)
|
|
||||||
.await
|
|
||||||
.map_err(|e| CollectorError::SystemRead {
|
.map_err(|e| CollectorError::SystemRead {
|
||||||
path: self.backup_status_file.clone(),
|
path: self.status_file_path.clone(),
|
||||||
error: e.to_string(),
|
error: e.to_string(),
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let backup_status = toml::from_str(&content).map_err(|e| CollectorError::Parse {
|
let status: BackupStatusToml = toml::from_str(&content)
|
||||||
value: "backup status TOML".to_string(),
|
.map_err(|e| CollectorError::Parse {
|
||||||
error: e.to_string(),
|
value: content.clone(),
|
||||||
})?;
|
error: format!("Failed to parse backup status TOML: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
Ok(Some(backup_status))
|
Ok(Some(status))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn calculate_backup_status(&self, backup_status: &BackupStatusToml) -> Status {
|
/// Convert BackupStatusToml to BackupData and populate AgentData
|
||||||
// Parse the start time to check age - handle both RFC3339 and local timestamp formats
|
async fn populate_backup_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
let start_time = match chrono::DateTime::parse_from_rfc3339(&backup_status.start_time) {
|
if let Some(backup_status) = self.read_backup_status().await? {
|
||||||
Ok(dt) => dt.with_timezone(&Utc),
|
// Use raw start_time string from TOML
|
||||||
Err(_) => {
|
|
||||||
// Try parsing as naive datetime and assume UTC
|
|
||||||
match chrono::NaiveDateTime::parse_from_str(
|
|
||||||
&backup_status.start_time,
|
|
||||||
"%Y-%m-%dT%H:%M:%S%.f",
|
|
||||||
) {
|
|
||||||
Ok(naive_dt) => naive_dt.and_utc(),
|
|
||||||
Err(_) => {
|
|
||||||
error!(
|
|
||||||
"Failed to parse backup timestamp: {}",
|
|
||||||
backup_status.start_time
|
|
||||||
);
|
|
||||||
return Status::Unknown;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let hours_since_backup = Utc::now().signed_duration_since(start_time).num_hours();
|
// Extract disk information
|
||||||
|
let repository_disk = if let Some(disk_space) = &backup_status.disk_space {
|
||||||
|
Some(BackupDiskData {
|
||||||
|
serial: backup_status.disk_serial_number.clone().unwrap_or_else(|| "Unknown".to_string()),
|
||||||
|
usage_percent: disk_space.usage_percent as f32,
|
||||||
|
used_gb: disk_space.used_gb as f32,
|
||||||
|
total_gb: disk_space.total_gb as f32,
|
||||||
|
wear_percent: backup_status.disk_wear_percent,
|
||||||
|
temperature_celsius: None, // Not available in current TOML
|
||||||
|
})
|
||||||
|
} else if let Some(serial) = &backup_status.disk_serial_number {
|
||||||
|
// Fallback: create minimal disk info if we have serial but no disk_space
|
||||||
|
Some(BackupDiskData {
|
||||||
|
serial: serial.clone(),
|
||||||
|
usage_percent: 0.0,
|
||||||
|
used_gb: 0.0,
|
||||||
|
total_gb: 0.0,
|
||||||
|
wear_percent: backup_status.disk_wear_percent,
|
||||||
|
temperature_celsius: None,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
// Check overall backup status
|
// Calculate total repository size from services
|
||||||
match backup_status.status.as_str() {
|
let total_size_gb = backup_status.services
|
||||||
"success" => {
|
.values()
|
||||||
if hours_since_backup > self.max_age_hours as i64 {
|
.map(|service| service.repo_size_bytes as f32 / (1024.0 * 1024.0 * 1024.0))
|
||||||
Status::Warning // Backup too old
|
.sum::<f32>();
|
||||||
} else {
|
|
||||||
Status::Ok
|
let backup_data = BackupData {
|
||||||
}
|
status: backup_status.status,
|
||||||
}
|
total_size_gb: Some(total_size_gb),
|
||||||
"failed" => Status::Critical,
|
repository_health: Some("ok".to_string()), // Derive from status if needed
|
||||||
"warning" => Status::Warning, // Backup completed with warnings
|
repository_disk,
|
||||||
"running" => Status::Ok, // Currently running is OK
|
last_backup_size_gb: None, // Not available in current TOML format
|
||||||
"pending" => Status::Pending, // Maintenance mode or backup starting
|
start_time_raw: Some(backup_status.start_time),
|
||||||
_ => Status::Unknown,
|
};
|
||||||
|
|
||||||
|
agent_data.backup = backup_data;
|
||||||
|
} else {
|
||||||
|
// No backup status available - set default values
|
||||||
|
agent_data.backup = BackupData {
|
||||||
|
status: "unavailable".to_string(),
|
||||||
|
total_size_gb: None,
|
||||||
|
repository_health: None,
|
||||||
|
repository_disk: None,
|
||||||
|
last_backup_size_gb: None,
|
||||||
|
start_time_raw: None,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
fn calculate_service_status(&self, service: &ServiceStatus) -> Status {
|
Ok(())
|
||||||
match service.status.as_str() {
|
|
||||||
"completed" => {
|
|
||||||
if service.exit_code == 0 {
|
|
||||||
Status::Ok
|
|
||||||
} else {
|
|
||||||
Status::Critical
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"failed" => Status::Critical,
|
|
||||||
"disabled" => Status::Warning, // Service intentionally disabled
|
|
||||||
"running" => Status::Ok,
|
|
||||||
_ => Status::Unknown,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn bytes_to_gb(bytes: u64) -> f32 {
|
|
||||||
bytes as f32 / (1024.0 * 1024.0 * 1024.0)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl Collector for BackupCollector {
|
impl Collector for BackupCollector {
|
||||||
|
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
debug!("Collecting backup status");
|
||||||
let backup_status_option = self.read_backup_status().await?;
|
self.populate_backup_data(agent_data).await
|
||||||
let mut metrics = Vec::new();
|
|
||||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
|
||||||
|
|
||||||
// If no backup status file exists, return minimal metrics indicating no backup system
|
|
||||||
let backup_status = match backup_status_option {
|
|
||||||
Some(status) => status,
|
|
||||||
None => {
|
|
||||||
// No backup system configured - return minimal "unknown" metrics
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_overall_status".to_string(),
|
|
||||||
value: MetricValue::String("no_backup_system".to_string()),
|
|
||||||
status: Status::Unknown,
|
|
||||||
timestamp,
|
|
||||||
description: Some("No backup system configured (no status file found)".to_string()),
|
|
||||||
unit: None,
|
|
||||||
});
|
|
||||||
return Ok(metrics);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Overall backup status
|
|
||||||
let overall_status = self.calculate_backup_status(&backup_status);
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_overall_status".to_string(),
|
|
||||||
value: MetricValue::String(match overall_status {
|
|
||||||
Status::Ok => "ok".to_string(),
|
|
||||||
Status::Inactive => "inactive".to_string(),
|
|
||||||
Status::Pending => "pending".to_string(),
|
|
||||||
Status::Warning => "warning".to_string(),
|
|
||||||
Status::Critical => "critical".to_string(),
|
|
||||||
Status::Unknown => "unknown".to_string(),
|
|
||||||
Status::Offline => "offline".to_string(),
|
|
||||||
}),
|
|
||||||
status: overall_status,
|
|
||||||
timestamp,
|
|
||||||
description: Some(format!(
|
|
||||||
"Backup: {} at {}",
|
|
||||||
backup_status.status, backup_status.start_time
|
|
||||||
)),
|
|
||||||
unit: None,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Backup duration
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_duration_seconds".to_string(),
|
|
||||||
value: MetricValue::Integer(backup_status.duration_seconds),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some("Duration of last backup run".to_string()),
|
|
||||||
unit: Some("seconds".to_string()),
|
|
||||||
});
|
|
||||||
|
|
||||||
// Last backup timestamp - use last_updated (when backup finished) instead of start_time
|
|
||||||
let last_updated_dt_result =
|
|
||||||
chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
|
|
||||||
.map(|dt| dt.with_timezone(&Utc))
|
|
||||||
.or_else(|_| {
|
|
||||||
// Try parsing as naive datetime and assume UTC
|
|
||||||
chrono::NaiveDateTime::parse_from_str(
|
|
||||||
&backup_status.last_updated,
|
|
||||||
"%Y-%m-%dT%H:%M:%S%.f",
|
|
||||||
)
|
|
||||||
.map(|naive_dt| naive_dt.and_utc())
|
|
||||||
});
|
|
||||||
|
|
||||||
if let Ok(last_updated_dt) = last_updated_dt_result {
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_last_run_timestamp".to_string(),
|
|
||||||
value: MetricValue::Integer(last_updated_dt.timestamp()),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some("Timestamp of last backup completion".to_string()),
|
|
||||||
unit: Some("unix_timestamp".to_string()),
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
error!(
|
|
||||||
"Failed to parse backup timestamp for last_run_timestamp: {}",
|
|
||||||
backup_status.last_updated
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Individual service metrics
|
|
||||||
for (service_name, service) in &backup_status.services {
|
|
||||||
let service_status = self.calculate_service_status(service);
|
|
||||||
|
|
||||||
// Service status
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("backup_service_{}_status", service_name),
|
|
||||||
value: MetricValue::String(match service_status {
|
|
||||||
Status::Ok => "ok".to_string(),
|
|
||||||
Status::Inactive => "inactive".to_string(),
|
|
||||||
Status::Pending => "pending".to_string(),
|
|
||||||
Status::Warning => "warning".to_string(),
|
|
||||||
Status::Critical => "critical".to_string(),
|
|
||||||
Status::Unknown => "unknown".to_string(),
|
|
||||||
Status::Offline => "offline".to_string(),
|
|
||||||
}),
|
|
||||||
status: service_status,
|
|
||||||
timestamp,
|
|
||||||
description: Some(format!(
|
|
||||||
"Backup service {} status: {}",
|
|
||||||
service_name, service.status
|
|
||||||
)),
|
|
||||||
unit: None,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Service exit code
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("backup_service_{}_exit_code", service_name),
|
|
||||||
value: MetricValue::Integer(service.exit_code),
|
|
||||||
status: if service.exit_code == 0 {
|
|
||||||
Status::Ok
|
|
||||||
} else {
|
|
||||||
Status::Critical
|
|
||||||
},
|
|
||||||
timestamp,
|
|
||||||
description: Some(format!("Exit code for backup service {}", service_name)),
|
|
||||||
unit: None,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Repository archive count
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("backup_service_{}_archive_count", service_name),
|
|
||||||
value: MetricValue::Integer(service.archive_count),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some(format!("Number of archives in {} repository", service_name)),
|
|
||||||
unit: Some("archives".to_string()),
|
|
||||||
});
|
|
||||||
|
|
||||||
// Repository size in GB
|
|
||||||
let repo_size_gb = Self::bytes_to_gb(service.repo_size_bytes);
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("backup_service_{}_repo_size_gb", service_name),
|
|
||||||
value: MetricValue::Float(repo_size_gb),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some(format!("Repository size for {} in GB", service_name)),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
});
|
|
||||||
|
|
||||||
// Repository path for reference
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("backup_service_{}_repo_path", service_name),
|
|
||||||
value: MetricValue::String(service.repo_path.clone()),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some(format!("Repository path for {}", service_name)),
|
|
||||||
unit: None,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Total number of services
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_total_services".to_string(),
|
|
||||||
value: MetricValue::Integer(backup_status.services.len() as i64),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some("Total number of backup services".to_string()),
|
|
||||||
unit: Some("services".to_string()),
|
|
||||||
});
|
|
||||||
|
|
||||||
// Calculate total repository size
|
|
||||||
let total_size_bytes: u64 = backup_status
|
|
||||||
.services
|
|
||||||
.values()
|
|
||||||
.map(|s| s.repo_size_bytes)
|
|
||||||
.sum();
|
|
||||||
let total_size_gb = Self::bytes_to_gb(total_size_bytes);
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_total_repo_size_gb".to_string(),
|
|
||||||
value: MetricValue::Float(total_size_gb),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some("Total size of all backup repositories".to_string()),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
});
|
|
||||||
|
|
||||||
// Disk space metrics for backup directory
|
|
||||||
if let Some(ref disk_space) = backup_status.disk_space {
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_disk_total_gb".to_string(),
|
|
||||||
value: MetricValue::Float(disk_space.total_gb as f32),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some("Total disk space available for backups".to_string()),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_disk_used_gb".to_string(),
|
|
||||||
value: MetricValue::Float(disk_space.used_gb as f32),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some("Used disk space on backup drive".to_string()),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_disk_available_gb".to_string(),
|
|
||||||
value: MetricValue::Float(disk_space.available_gb as f32),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some("Available disk space on backup drive".to_string()),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_disk_usage_percent".to_string(),
|
|
||||||
value: MetricValue::Float(disk_space.usage_percent as f32),
|
|
||||||
status: if disk_space.usage_percent >= 95.0 {
|
|
||||||
Status::Critical
|
|
||||||
} else if disk_space.usage_percent >= 85.0 {
|
|
||||||
Status::Warning
|
|
||||||
} else {
|
|
||||||
Status::Ok
|
|
||||||
},
|
|
||||||
timestamp,
|
|
||||||
description: Some("Backup disk usage percentage".to_string()),
|
|
||||||
unit: Some("percent".to_string()),
|
|
||||||
});
|
|
||||||
|
|
||||||
// Add disk identification metrics if available from disk_space
|
|
||||||
if let Some(ref product_name) = disk_space.product_name {
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_disk_product_name".to_string(),
|
|
||||||
value: MetricValue::String(product_name.clone()),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some("Backup disk product name from SMART data".to_string()),
|
|
||||||
unit: None,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(ref serial_number) = disk_space.serial_number {
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_disk_serial_number".to_string(),
|
|
||||||
value: MetricValue::String(serial_number.clone()),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some("Backup disk serial number from SMART data".to_string()),
|
|
||||||
unit: None,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add standalone disk identification metrics from TOML fields
|
|
||||||
if let Some(ref product_name) = backup_status.disk_product_name {
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_disk_product_name".to_string(),
|
|
||||||
value: MetricValue::String(product_name.clone()),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some("Backup disk product name from SMART data".to_string()),
|
|
||||||
unit: None,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(ref serial_number) = backup_status.disk_serial_number {
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_disk_serial_number".to_string(),
|
|
||||||
value: MetricValue::String(serial_number.clone()),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some("Backup disk serial number from SMART data".to_string()),
|
|
||||||
unit: None,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(wear_percent) = backup_status.disk_wear_percent {
|
|
||||||
let wear_status = if wear_percent >= 90.0 {
|
|
||||||
Status::Critical
|
|
||||||
} else if wear_percent >= 75.0 {
|
|
||||||
Status::Warning
|
|
||||||
} else {
|
|
||||||
Status::Ok
|
|
||||||
};
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "backup_disk_wear_percent".to_string(),
|
|
||||||
value: MetricValue::Float(wear_percent),
|
|
||||||
status: wear_status,
|
|
||||||
timestamp,
|
|
||||||
description: Some("Backup disk wear percentage from SMART data".to_string()),
|
|
||||||
unit: Some("percent".to_string()),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Count services by status
|
|
||||||
let mut status_counts = HashMap::new();
|
|
||||||
for service in backup_status.services.values() {
|
|
||||||
*status_counts.entry(service.status.clone()).or_insert(0) += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (status_name, count) in status_counts {
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("backup_services_{}_count", status_name),
|
|
||||||
value: MetricValue::Integer(count),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
description: Some(format!("Number of services with status: {}", status_name)),
|
|
||||||
unit: Some("services".to_string()),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(metrics)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// TOML structure for backup status file
|
/// TOML structure for backup status file
|
||||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct BackupStatusToml {
|
struct BackupStatusToml {
|
||||||
pub backup_name: String,
|
pub backup_name: String,
|
||||||
pub start_time: String,
|
pub start_time: String,
|
||||||
pub current_time: String,
|
pub current_time: String,
|
||||||
@@ -456,8 +128,8 @@ pub struct BackupStatusToml {
|
|||||||
pub services: HashMap<String, ServiceStatus>,
|
pub services: HashMap<String, ServiceStatus>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct DiskSpace {
|
struct DiskSpace {
|
||||||
pub total_bytes: u64,
|
pub total_bytes: u64,
|
||||||
pub used_bytes: u64,
|
pub used_bytes: u64,
|
||||||
pub available_bytes: u64,
|
pub available_bytes: u64,
|
||||||
@@ -465,13 +137,10 @@ pub struct DiskSpace {
|
|||||||
pub used_gb: f64,
|
pub used_gb: f64,
|
||||||
pub available_gb: f64,
|
pub available_gb: f64,
|
||||||
pub usage_percent: f64,
|
pub usage_percent: f64,
|
||||||
// Optional disk identification fields
|
|
||||||
pub product_name: Option<String>,
|
|
||||||
pub serial_number: Option<String>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct ServiceStatus {
|
struct ServiceStatus {
|
||||||
pub status: String,
|
pub status: String,
|
||||||
pub exit_code: i64,
|
pub exit_code: i64,
|
||||||
pub repo_path: String,
|
pub repo_path: String,
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
use cm_dashboard_shared::{AgentData, Status, HysteresisThresholds};
|
||||||
|
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
@@ -38,19 +38,31 @@ impl CpuCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate CPU load status using hysteresis thresholds
|
/// Calculate CPU load status using thresholds
|
||||||
fn calculate_load_status(&self, metric_name: &str, load: f32, status_tracker: &mut StatusTracker) -> Status {
|
fn calculate_load_status(&self, load: f32) -> Status {
|
||||||
status_tracker.calculate_with_hysteresis(metric_name, load, &self.load_thresholds)
|
if load >= self.load_thresholds.critical_high {
|
||||||
|
Status::Critical
|
||||||
|
} else if load >= self.load_thresholds.warning_high {
|
||||||
|
Status::Warning
|
||||||
|
} else {
|
||||||
|
Status::Ok
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate CPU temperature status using hysteresis thresholds
|
/// Calculate CPU temperature status using thresholds
|
||||||
fn calculate_temperature_status(&self, metric_name: &str, temp: f32, status_tracker: &mut StatusTracker) -> Status {
|
fn calculate_temperature_status(&self, temp: f32) -> Status {
|
||||||
status_tracker.calculate_with_hysteresis(metric_name, temp, &self.temperature_thresholds)
|
if temp >= self.temperature_thresholds.critical_high {
|
||||||
|
Status::Critical
|
||||||
|
} else if temp >= self.temperature_thresholds.warning_high {
|
||||||
|
Status::Warning
|
||||||
|
} else {
|
||||||
|
Status::Ok
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Collect CPU load averages from /proc/loadavg
|
/// Collect CPU load averages and populate AgentData
|
||||||
/// Format: "0.52 0.58 0.59 1/257 12345"
|
/// Format: "0.52 0.58 0.59 1/257 12345"
|
||||||
async fn collect_load_averages(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
async fn collect_load_averages(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
let content = utils::read_proc_file("/proc/loadavg")?;
|
let content = utils::read_proc_file("/proc/loadavg")?;
|
||||||
let parts: Vec<&str> = content.trim().split_whitespace().collect();
|
let parts: Vec<&str> = content.trim().split_whitespace().collect();
|
||||||
|
|
||||||
@@ -65,53 +77,25 @@ impl CpuCollector {
|
|||||||
let load_5min = utils::parse_f32(parts[1])?;
|
let load_5min = utils::parse_f32(parts[1])?;
|
||||||
let load_15min = utils::parse_f32(parts[2])?;
|
let load_15min = utils::parse_f32(parts[2])?;
|
||||||
|
|
||||||
// Only apply thresholds to 5-minute load average
|
// Populate CPU data directly
|
||||||
let load_1min_status = Status::Ok; // No alerting on 1min
|
agent_data.system.cpu.load_1min = load_1min;
|
||||||
let load_5min_status = self.calculate_load_status(registry::CPU_LOAD_5MIN, load_5min, status_tracker); // Only 5min triggers alerts
|
agent_data.system.cpu.load_5min = load_5min;
|
||||||
let load_15min_status = Status::Ok; // No alerting on 15min
|
agent_data.system.cpu.load_15min = load_15min;
|
||||||
|
|
||||||
Ok(vec![
|
Ok(())
|
||||||
Metric::new(
|
|
||||||
registry::CPU_LOAD_1MIN.to_string(),
|
|
||||||
MetricValue::Float(load_1min),
|
|
||||||
load_1min_status,
|
|
||||||
)
|
|
||||||
.with_description("CPU load average over 1 minute".to_string()),
|
|
||||||
Metric::new(
|
|
||||||
registry::CPU_LOAD_5MIN.to_string(),
|
|
||||||
MetricValue::Float(load_5min),
|
|
||||||
load_5min_status,
|
|
||||||
)
|
|
||||||
.with_description("CPU load average over 5 minutes".to_string()),
|
|
||||||
Metric::new(
|
|
||||||
registry::CPU_LOAD_15MIN.to_string(),
|
|
||||||
MetricValue::Float(load_15min),
|
|
||||||
load_15min_status,
|
|
||||||
)
|
|
||||||
.with_description("CPU load average over 15 minutes".to_string()),
|
|
||||||
])
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Collect CPU temperature from thermal zones
|
/// Collect CPU temperature and populate AgentData
|
||||||
/// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
|
/// Prioritizes x86_pkg_temp over generic thermal zones
|
||||||
async fn collect_temperature(&self, status_tracker: &mut StatusTracker) -> Result<Option<Metric>, CollectorError> {
|
async fn collect_temperature(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
// Try x86_pkg_temp first (Intel CPU package temperature)
|
// Try x86_pkg_temp first (Intel CPU package temperature)
|
||||||
if let Ok(temp) = self
|
if let Ok(temp) = self
|
||||||
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
|
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
let temp_celsius = temp as f32 / 1000.0;
|
let temp_celsius = temp as f32 / 1000.0;
|
||||||
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
|
agent_data.system.cpu.temperature_celsius = Some(temp_celsius);
|
||||||
|
return Ok(());
|
||||||
return Ok(Some(
|
|
||||||
Metric::new(
|
|
||||||
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
|
|
||||||
MetricValue::Float(temp_celsius),
|
|
||||||
status,
|
|
||||||
)
|
|
||||||
.with_description("CPU package temperature".to_string())
|
|
||||||
.with_unit("°C".to_string()),
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fallback: try other thermal zones
|
// Fallback: try other thermal zones
|
||||||
@@ -119,22 +103,14 @@ impl CpuCollector {
|
|||||||
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
|
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
|
||||||
if let Ok(temp) = self.read_thermal_zone(&path).await {
|
if let Ok(temp) = self.read_thermal_zone(&path).await {
|
||||||
let temp_celsius = temp as f32 / 1000.0;
|
let temp_celsius = temp as f32 / 1000.0;
|
||||||
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
|
agent_data.system.cpu.temperature_celsius = Some(temp_celsius);
|
||||||
|
return Ok(());
|
||||||
return Ok(Some(
|
|
||||||
Metric::new(
|
|
||||||
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
|
|
||||||
MetricValue::Float(temp_celsius),
|
|
||||||
status,
|
|
||||||
)
|
|
||||||
.with_description(format!("CPU temperature from thermal_zone{}", zone_id))
|
|
||||||
.with_unit("°C".to_string()),
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("No CPU temperature sensors found");
|
debug!("No CPU temperature sensors found");
|
||||||
Ok(None)
|
// Leave temperature as None if not available
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Read temperature from thermal zone efficiently
|
/// Read temperature from thermal zone efficiently
|
||||||
@@ -143,24 +119,16 @@ impl CpuCollector {
|
|||||||
utils::parse_u64(content.trim())
|
utils::parse_u64(content.trim())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Collect CPU frequency from /proc/cpuinfo or scaling governor
|
/// Collect CPU frequency and populate AgentData
|
||||||
async fn collect_frequency(&self) -> Result<Option<Metric>, CollectorError> {
|
async fn collect_frequency(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
// Try scaling frequency first (more accurate for current frequency)
|
// Try scaling frequency first (more accurate for current frequency)
|
||||||
if let Ok(freq) =
|
if let Ok(freq) =
|
||||||
utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq")
|
utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq")
|
||||||
{
|
{
|
||||||
if let Ok(freq_khz) = utils::parse_u64(freq.trim()) {
|
if let Ok(freq_khz) = utils::parse_u64(freq.trim()) {
|
||||||
let freq_mhz = freq_khz as f32 / 1000.0;
|
let freq_mhz = freq_khz as f32 / 1000.0;
|
||||||
|
agent_data.system.cpu.frequency_mhz = freq_mhz;
|
||||||
return Ok(Some(
|
return Ok(());
|
||||||
Metric::new(
|
|
||||||
registry::CPU_FREQUENCY_MHZ.to_string(),
|
|
||||||
MetricValue::Float(freq_mhz),
|
|
||||||
Status::Ok, // Frequency doesn't have status thresholds
|
|
||||||
)
|
|
||||||
.with_description("Current CPU frequency".to_string())
|
|
||||||
.with_unit("MHz".to_string()),
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -170,17 +138,8 @@ impl CpuCollector {
|
|||||||
if line.starts_with("cpu MHz") {
|
if line.starts_with("cpu MHz") {
|
||||||
if let Some(freq_str) = line.split(':').nth(1) {
|
if let Some(freq_str) = line.split(':').nth(1) {
|
||||||
if let Ok(freq_mhz) = utils::parse_f32(freq_str) {
|
if let Ok(freq_mhz) = utils::parse_f32(freq_str) {
|
||||||
return Ok(Some(
|
agent_data.system.cpu.frequency_mhz = freq_mhz;
|
||||||
Metric::new(
|
return Ok(());
|
||||||
registry::CPU_FREQUENCY_MHZ.to_string(),
|
|
||||||
MetricValue::Float(freq_mhz),
|
|
||||||
Status::Ok,
|
|
||||||
)
|
|
||||||
.with_description(
|
|
||||||
"CPU base frequency from /proc/cpuinfo".to_string(),
|
|
||||||
)
|
|
||||||
.with_unit("MHz".to_string()),
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break; // Only need first CPU entry
|
break; // Only need first CPU entry
|
||||||
@@ -189,38 +148,28 @@ impl CpuCollector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
debug!("CPU frequency not available");
|
debug!("CPU frequency not available");
|
||||||
Ok(None)
|
// Leave frequency as 0.0 if not available
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl Collector for CpuCollector {
|
impl Collector for CpuCollector {
|
||||||
|
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
|
||||||
debug!("Collecting CPU metrics");
|
debug!("Collecting CPU metrics");
|
||||||
let start = std::time::Instant::now();
|
let start = std::time::Instant::now();
|
||||||
|
|
||||||
let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency
|
|
||||||
|
|
||||||
// Collect load averages (always available)
|
// Collect load averages (always available)
|
||||||
metrics.extend(self.collect_load_averages(status_tracker).await?);
|
self.collect_load_averages(agent_data).await?;
|
||||||
|
|
||||||
// Collect temperature (optional)
|
// Collect temperature (optional)
|
||||||
if let Some(temp_metric) = self.collect_temperature(status_tracker).await? {
|
self.collect_temperature(agent_data).await?;
|
||||||
metrics.push(temp_metric);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Collect frequency (optional)
|
// Collect frequency (optional)
|
||||||
if let Some(freq_metric) = self.collect_frequency().await? {
|
self.collect_frequency(agent_data).await?;
|
||||||
metrics.push(freq_metric);
|
|
||||||
}
|
|
||||||
|
|
||||||
let duration = start.elapsed();
|
let duration = start.elapsed();
|
||||||
debug!(
|
debug!("CPU collection completed in {:?}", duration);
|
||||||
"CPU collection completed in {:?} with {} metrics",
|
|
||||||
duration,
|
|
||||||
metrics.len()
|
|
||||||
);
|
|
||||||
|
|
||||||
// Efficiency check: warn if collection takes too long
|
// Efficiency check: warn if collection takes too long
|
||||||
if duration.as_millis() > 1 {
|
if duration.as_millis() > 1 {
|
||||||
@@ -230,10 +179,14 @@ impl Collector for CpuCollector {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Store performance metrics
|
// Calculate status using thresholds
|
||||||
// Performance tracking handled by cache system
|
agent_data.system.cpu.load_status = self.calculate_load_status(agent_data.system.cpu.load_1min);
|
||||||
|
agent_data.system.cpu.temperature_status = if let Some(temp) = agent_data.system.cpu.temperature_celsius {
|
||||||
|
self.calculate_temperature_status(temp)
|
||||||
|
} else {
|
||||||
|
Status::Unknown
|
||||||
|
};
|
||||||
|
|
||||||
Ok(metrics)
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
use cm_dashboard_shared::{AgentData, DriveData, FilesystemData, PoolData, HysteresisThresholds, Status};
|
||||||
|
|
||||||
use crate::config::DiskConfig;
|
use crate::config::DiskConfig;
|
||||||
use std::process::Command;
|
use std::process::Command;
|
||||||
@@ -10,7 +10,7 @@ use tracing::debug;
|
|||||||
|
|
||||||
use super::{Collector, CollectorError};
|
use super::{Collector, CollectorError};
|
||||||
|
|
||||||
/// Storage collector with clean architecture
|
/// Storage collector with clean architecture and structured data output
|
||||||
pub struct DiskCollector {
|
pub struct DiskCollector {
|
||||||
config: DiskConfig,
|
config: DiskConfig,
|
||||||
temperature_thresholds: HysteresisThresholds,
|
temperature_thresholds: HysteresisThresholds,
|
||||||
@@ -19,55 +19,44 @@ pub struct DiskCollector {
|
|||||||
/// A physical drive with its filesystems
|
/// A physical drive with its filesystems
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
struct PhysicalDrive {
|
struct PhysicalDrive {
|
||||||
device: String, // e.g., "nvme0n1", "sda"
|
name: String, // e.g., "nvme0n1", "sda"
|
||||||
filesystems: Vec<Filesystem>, // mounted filesystems on this drive
|
health: String, // SMART health status
|
||||||
temperature: Option<f32>, // drive temperature
|
filesystems: Vec<Filesystem>, // mounted filesystems on this drive
|
||||||
wear_level: Option<f32>, // SSD wear level
|
|
||||||
health_status: String, // SMART health
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A mergerfs pool
|
/// A filesystem mounted on a drive
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
struct MergerfsPool {
|
|
||||||
mount_point: String, // e.g., "/srv/media"
|
|
||||||
total_bytes: u64, // pool total capacity
|
|
||||||
used_bytes: u64, // pool used space
|
|
||||||
data_drives: Vec<DriveInfo>, // data member drives
|
|
||||||
parity_drives: Vec<DriveInfo>, // parity drives
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Individual filesystem on a drive
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
struct Filesystem {
|
struct Filesystem {
|
||||||
mount_point: String, // e.g., "/", "/boot"
|
mount_point: String, // e.g., "/", "/boot"
|
||||||
total_bytes: u64, // filesystem capacity
|
usage_percent: f32, // Usage percentage
|
||||||
used_bytes: u64, // filesystem used space
|
used_bytes: u64, // Used bytes
|
||||||
|
total_bytes: u64, // Total bytes
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Drive information for pools
|
/// MergerFS pool
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
struct DriveInfo {
|
struct MergerfsPool {
|
||||||
device: String, // e.g., "sdb", "sdc"
|
name: String, // e.g., "srv_media"
|
||||||
mount_point: String, // e.g., "/mnt/disk1"
|
mount_point: String, // e.g., "/srv/media"
|
||||||
temperature: Option<f32>, // drive temperature
|
total_bytes: u64, // Pool total bytes
|
||||||
wear_level: Option<f32>, // SSD wear level
|
used_bytes: u64, // Pool used bytes
|
||||||
health_status: String, // SMART health
|
data_drives: Vec<PoolDrive>, // Data drives in pool
|
||||||
|
parity_drives: Vec<PoolDrive>, // Parity drives in pool
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Discovered storage topology
|
/// Drive in a storage pool
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Clone)]
|
||||||
struct StorageTopology {
|
struct PoolDrive {
|
||||||
physical_drives: Vec<PhysicalDrive>,
|
name: String, // Drive name
|
||||||
mergerfs_pools: Vec<MergerfsPool>,
|
mount_point: String, // e.g., "/mnt/disk1"
|
||||||
|
temperature_celsius: Option<f32>, // Drive temperature
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DiskCollector {
|
impl DiskCollector {
|
||||||
pub fn new(config: DiskConfig) -> Self {
|
pub fn new(config: DiskConfig) -> Self {
|
||||||
let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
|
let temperature_thresholds = HysteresisThresholds::new(
|
||||||
config.temperature_warning_celsius,
|
config.temperature_warning_celsius,
|
||||||
5.0,
|
|
||||||
config.temperature_critical_celsius,
|
config.temperature_critical_celsius,
|
||||||
5.0,
|
|
||||||
);
|
);
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
@@ -76,72 +65,85 @@ impl DiskCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Discover all storage using clean workflow: lsblk → df → group
|
/// Collect all storage data and populate AgentData
|
||||||
fn discover_storage(&self) -> Result<StorageTopology> {
|
async fn collect_storage_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
debug!("Starting storage discovery");
|
let start_time = Instant::now();
|
||||||
|
debug!("Starting clean storage collection");
|
||||||
|
|
||||||
// Step 1: Get all mount points and their backing devices using lsblk
|
// Step 1: Get mount points and their backing devices
|
||||||
let mount_devices = self.get_mount_devices()?;
|
let mount_devices = self.get_mount_devices().await?;
|
||||||
debug!("Found {} mount points", mount_devices.len());
|
|
||||||
|
|
||||||
// Step 2: Get filesystem usage for each mount point using df
|
// Step 2: Get filesystem usage for each mount point using df
|
||||||
let filesystem_usage = self.get_filesystem_usage(&mount_devices)?;
|
let mut filesystem_usage = self.get_filesystem_usage(&mount_devices).map_err(|e| CollectorError::Parse {
|
||||||
debug!("Got usage data for {} filesystems", filesystem_usage.len());
|
value: "filesystem usage".to_string(),
|
||||||
|
error: format!("Failed to get filesystem usage: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
// Step 3: Detect mergerfs pools from /proc/mounts
|
// Step 2.5: Add MergerFS mount points that weren't in lsblk output
|
||||||
let mergerfs_pools = self.discover_mergerfs_pools()?;
|
self.add_mergerfs_filesystem_usage(&mut filesystem_usage).map_err(|e| CollectorError::Parse {
|
||||||
debug!("Found {} mergerfs pools", mergerfs_pools.len());
|
value: "mergerfs filesystem usage".to_string(),
|
||||||
|
error: format!("Failed to get mergerfs filesystem usage: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
// Step 4: Group regular filesystems by physical drive
|
// Step 3: Detect MergerFS pools
|
||||||
let physical_drives = self.group_by_physical_drive(&mount_devices, &filesystem_usage, &mergerfs_pools)?;
|
let mergerfs_pools = self.detect_mergerfs_pools(&filesystem_usage).map_err(|e| CollectorError::Parse {
|
||||||
debug!("Grouped into {} physical drives", physical_drives.len());
|
value: "mergerfs pools".to_string(),
|
||||||
|
error: format!("Failed to detect mergerfs pools: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
Ok(StorageTopology {
|
// Step 4: Group filesystems by physical drive (excluding mergerfs members)
|
||||||
physical_drives,
|
let physical_drives = self.group_by_physical_drive(&mount_devices, &filesystem_usage, &mergerfs_pools).map_err(|e| CollectorError::Parse {
|
||||||
mergerfs_pools,
|
value: "physical drives".to_string(),
|
||||||
})
|
error: format!("Failed to group by physical drive: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// Step 5: Get SMART data for all drives
|
||||||
|
let smart_data = self.get_smart_data_for_drives(&physical_drives, &mergerfs_pools).await;
|
||||||
|
|
||||||
|
// Step 6: Populate AgentData
|
||||||
|
self.populate_drives_data(&physical_drives, &smart_data, agent_data)?;
|
||||||
|
self.populate_pools_data(&mergerfs_pools, &smart_data, agent_data)?;
|
||||||
|
|
||||||
|
let elapsed = start_time.elapsed();
|
||||||
|
debug!("Storage collection completed in {:?}", elapsed);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Use lsblk to get mount points and their backing devices
|
/// Get block devices and their mount points using lsblk
|
||||||
fn get_mount_devices(&self) -> Result<HashMap<String, String>> {
|
async fn get_mount_devices(&self) -> Result<HashMap<String, String>, CollectorError> {
|
||||||
let output = Command::new("lsblk")
|
let output = Command::new("lsblk")
|
||||||
.args(&["-n", "-o", "NAME,MOUNTPOINT"])
|
.args(&["-rn", "-o", "NAME,MOUNTPOINT"])
|
||||||
.output()?;
|
.output()
|
||||||
|
.map_err(|e| CollectorError::SystemRead {
|
||||||
if !output.status.success() {
|
path: "block devices".to_string(),
|
||||||
return Err(anyhow::anyhow!("lsblk command failed"));
|
error: e.to_string(),
|
||||||
}
|
})?;
|
||||||
|
|
||||||
let mut mount_devices = HashMap::new();
|
let mut mount_devices = HashMap::new();
|
||||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
for line in String::from_utf8_lossy(&output.stdout).lines() {
|
||||||
|
|
||||||
for line in output_str.lines() {
|
|
||||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||||
if parts.len() >= 2 {
|
if parts.len() >= 2 {
|
||||||
let device_name = parts[0]
|
let device_name = parts[0];
|
||||||
.trim_start_matches(&['├', '└', '─', ' '][..]);
|
|
||||||
let mount_point = parts[1];
|
let mount_point = parts[1];
|
||||||
|
|
||||||
// Skip unwanted mount points
|
// Skip swap partitions and unmounted devices
|
||||||
if self.should_skip_mount_point(mount_point) {
|
if mount_point == "[SWAP]" || mount_point.is_empty() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
mount_devices.insert(mount_point.to_string(), device_name.to_string());
|
// Convert device name to full path
|
||||||
|
let device_path = format!("/dev/{}", device_name);
|
||||||
|
mount_devices.insert(mount_point.to_string(), device_path);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
debug!("Found {} mounted block devices", mount_devices.len());
|
||||||
Ok(mount_devices)
|
Ok(mount_devices)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check if we should skip this mount point
|
|
||||||
fn should_skip_mount_point(&self, mount_point: &str) -> bool {
|
|
||||||
let skip_prefixes = ["/proc", "/sys", "/dev", "/tmp", "/run"];
|
|
||||||
skip_prefixes.iter().any(|prefix| mount_point.starts_with(prefix))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Use df to get filesystem usage for mount points
|
/// Use df to get filesystem usage for mount points
|
||||||
fn get_filesystem_usage(&self, mount_devices: &HashMap<String, String>) -> Result<HashMap<String, (u64, u64)>> {
|
fn get_filesystem_usage(&self, mount_devices: &HashMap<String, String>) -> anyhow::Result<HashMap<String, (u64, u64)>> {
|
||||||
let mut filesystem_usage = HashMap::new();
|
let mut filesystem_usage = HashMap::new();
|
||||||
|
|
||||||
for mount_point in mount_devices.keys() {
|
for mount_point in mount_devices.keys() {
|
||||||
@@ -158,38 +160,76 @@ impl DiskCollector {
|
|||||||
Ok(filesystem_usage)
|
Ok(filesystem_usage)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get filesystem info using df command
|
/// Add filesystem usage for MergerFS mount points that aren't in lsblk
|
||||||
fn get_filesystem_info(&self, path: &str) -> Result<(u64, u64)> {
|
fn add_mergerfs_filesystem_usage(&self, filesystem_usage: &mut HashMap<String, (u64, u64)>) -> anyhow::Result<()> {
|
||||||
let output = Command::new("df")
|
let mounts_content = std::fs::read_to_string("/proc/mounts")
|
||||||
.arg("--block-size=1")
|
.map_err(|e| anyhow::anyhow!("Failed to read /proc/mounts: {}", e))?;
|
||||||
.arg(path)
|
|
||||||
.output()?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
for line in mounts_content.lines() {
|
||||||
return Err(anyhow::anyhow!("df command failed for {}", path));
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||||
|
if parts.len() >= 3 && parts[2] == "fuse.mergerfs" {
|
||||||
|
let mount_point = parts[1].to_string();
|
||||||
|
|
||||||
|
// Only add if we don't already have usage data for this mount point
|
||||||
|
if !filesystem_usage.contains_key(&mount_point) {
|
||||||
|
if let Ok((total, used)) = self.get_filesystem_info(&mount_point) {
|
||||||
|
debug!("Added MergerFS filesystem usage for {}: {}GB total, {}GB used",
|
||||||
|
mount_point, total as f32 / (1024.0 * 1024.0 * 1024.0), used as f32 / (1024.0 * 1024.0 * 1024.0));
|
||||||
|
filesystem_usage.insert(mount_point, (total, used));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let output_str = String::from_utf8(output.stdout)?;
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get filesystem info for a single mount point
|
||||||
|
fn get_filesystem_info(&self, mount_point: &str) -> Result<(u64, u64), CollectorError> {
|
||||||
|
let output = Command::new("df")
|
||||||
|
.args(&["--block-size=1", mount_point])
|
||||||
|
.output()
|
||||||
|
.map_err(|e| CollectorError::SystemRead {
|
||||||
|
path: format!("df {}", mount_point),
|
||||||
|
error: e.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||||
let lines: Vec<&str> = output_str.lines().collect();
|
let lines: Vec<&str> = output_str.lines().collect();
|
||||||
|
|
||||||
if lines.len() < 2 {
|
if lines.len() < 2 {
|
||||||
return Err(anyhow::anyhow!("Unexpected df output format"));
|
return Err(CollectorError::Parse {
|
||||||
|
value: output_str.to_string(),
|
||||||
|
error: "Expected at least 2 lines from df output".to_string(),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let fields: Vec<&str> = lines[1].split_whitespace().collect();
|
// Parse the data line (skip header)
|
||||||
if fields.len() < 4 {
|
let parts: Vec<&str> = lines[1].split_whitespace().collect();
|
||||||
return Err(anyhow::anyhow!("Unexpected df fields count"));
|
if parts.len() < 4 {
|
||||||
|
return Err(CollectorError::Parse {
|
||||||
|
value: lines[1].to_string(),
|
||||||
|
error: "Expected at least 4 fields in df output".to_string(),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let total_bytes = fields[1].parse::<u64>()?;
|
let total_bytes: u64 = parts[1].parse().map_err(|e| CollectorError::Parse {
|
||||||
let used_bytes = fields[2].parse::<u64>()?;
|
value: parts[1].to_string(),
|
||||||
|
error: format!("Failed to parse total bytes: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let used_bytes: u64 = parts[2].parse().map_err(|e| CollectorError::Parse {
|
||||||
|
value: parts[2].to_string(),
|
||||||
|
error: format!("Failed to parse used bytes: {}", e),
|
||||||
|
})?;
|
||||||
|
|
||||||
Ok((total_bytes, used_bytes))
|
Ok((total_bytes, used_bytes))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Discover mergerfs pools from /proc/mounts
|
/// Detect MergerFS pools from mount data
|
||||||
fn discover_mergerfs_pools(&self) -> Result<Vec<MergerfsPool>> {
|
fn detect_mergerfs_pools(&self, filesystem_usage: &HashMap<String, (u64, u64)>) -> anyhow::Result<Vec<MergerfsPool>> {
|
||||||
let mounts_content = std::fs::read_to_string("/proc/mounts")?;
|
let mounts_content = std::fs::read_to_string("/proc/mounts")
|
||||||
|
.map_err(|e| anyhow::anyhow!("Failed to read /proc/mounts: {}", e))?;
|
||||||
let mut pools = Vec::new();
|
let mut pools = Vec::new();
|
||||||
|
|
||||||
for line in mounts_content.lines() {
|
for line in mounts_content.lines() {
|
||||||
@@ -199,20 +239,54 @@ impl DiskCollector {
|
|||||||
let device_sources = parts[0]; // e.g., "/mnt/disk1:/mnt/disk2"
|
let device_sources = parts[0]; // e.g., "/mnt/disk1:/mnt/disk2"
|
||||||
|
|
||||||
// Get pool usage
|
// Get pool usage
|
||||||
let (total_bytes, used_bytes) = self.get_filesystem_info(&mount_point)
|
let (total_bytes, used_bytes) = filesystem_usage.get(&mount_point)
|
||||||
|
.copied()
|
||||||
.unwrap_or((0, 0));
|
.unwrap_or((0, 0));
|
||||||
|
|
||||||
// Parse member paths
|
// Extract pool name from mount point (e.g., "/srv/media" -> "srv_media")
|
||||||
let member_paths: Vec<String> = device_sources
|
let pool_name = if mount_point == "/" {
|
||||||
|
"root".to_string()
|
||||||
|
} else {
|
||||||
|
mount_point.trim_start_matches('/').replace('/', "_")
|
||||||
|
};
|
||||||
|
|
||||||
|
if pool_name.is_empty() {
|
||||||
|
debug!("Skipping mergerfs pool with empty name: {}", mount_point);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse member paths - handle both full paths and numeric references
|
||||||
|
let raw_paths: Vec<String> = device_sources
|
||||||
.split(':')
|
.split(':')
|
||||||
.map(|s| s.trim().to_string())
|
.map(|s| s.trim().to_string())
|
||||||
.filter(|s| !s.is_empty())
|
.filter(|s| !s.is_empty())
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
// Convert numeric references to actual mount points if needed
|
||||||
|
let member_paths = if raw_paths.iter().any(|path| !path.starts_with('/')) {
|
||||||
|
// Handle numeric format like "1:2" by finding corresponding /mnt/disk* paths
|
||||||
|
self.resolve_numeric_mergerfs_paths(&raw_paths)?
|
||||||
|
} else {
|
||||||
|
// Already full paths
|
||||||
|
raw_paths
|
||||||
|
};
|
||||||
|
|
||||||
|
// For SnapRAID setups, include parity drives that are related to this pool's data drives
|
||||||
|
let mut all_member_paths = member_paths.clone();
|
||||||
|
let related_parity_paths = self.discover_related_parity_drives(&member_paths)?;
|
||||||
|
all_member_paths.extend(related_parity_paths);
|
||||||
|
|
||||||
// Categorize as data vs parity drives
|
// Categorize as data vs parity drives
|
||||||
let (data_drives, parity_drives) = self.categorize_pool_drives(&member_paths)?;
|
let (data_drives, parity_drives) = match self.categorize_pool_drives(&all_member_paths) {
|
||||||
|
Ok(drives) => drives,
|
||||||
|
Err(e) => {
|
||||||
|
debug!("Failed to categorize drives for pool {}: {}. Skipping.", mount_point, e);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
pools.push(MergerfsPool {
|
pools.push(MergerfsPool {
|
||||||
|
name: pool_name,
|
||||||
mount_point,
|
mount_point,
|
||||||
total_bytes,
|
total_bytes,
|
||||||
used_bytes,
|
used_bytes,
|
||||||
@@ -222,111 +296,20 @@ impl DiskCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
debug!("Found {} mergerfs pools", pools.len());
|
||||||
Ok(pools)
|
Ok(pools)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Categorize pool member drives as data vs parity
|
/// Group filesystems by physical drive (excluding mergerfs members) - exact old logic
|
||||||
fn categorize_pool_drives(&self, member_paths: &[String]) -> Result<(Vec<DriveInfo>, Vec<DriveInfo>)> {
|
|
||||||
let mut data_drives = Vec::new();
|
|
||||||
let mut parity_drives = Vec::new();
|
|
||||||
|
|
||||||
for path in member_paths {
|
|
||||||
let drive_info = self.get_drive_info_for_path(path)?;
|
|
||||||
|
|
||||||
// Heuristic: if path contains "parity", it's parity
|
|
||||||
if path.to_lowercase().contains("parity") {
|
|
||||||
parity_drives.push(drive_info);
|
|
||||||
} else {
|
|
||||||
data_drives.push(drive_info);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok((data_drives, parity_drives))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get drive information for a mount path
|
|
||||||
fn get_drive_info_for_path(&self, path: &str) -> Result<DriveInfo> {
|
|
||||||
// Use lsblk to find the backing device
|
|
||||||
let output = Command::new("lsblk")
|
|
||||||
.args(&["-n", "-o", "NAME,MOUNTPOINT"])
|
|
||||||
.output()?;
|
|
||||||
|
|
||||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let mut device = String::new();
|
|
||||||
|
|
||||||
for line in output_str.lines() {
|
|
||||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
|
||||||
if parts.len() >= 2 && parts[1] == path {
|
|
||||||
device = parts[0]
|
|
||||||
.trim_start_matches('├')
|
|
||||||
.trim_start_matches('└')
|
|
||||||
.trim_start_matches('─')
|
|
||||||
.trim()
|
|
||||||
.to_string();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if device.is_empty() {
|
|
||||||
return Err(anyhow::anyhow!("Could not find device for path {}", path));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract base device name (e.g., "sda1" -> "sda")
|
|
||||||
let base_device = self.extract_base_device(&device);
|
|
||||||
|
|
||||||
// Get SMART data
|
|
||||||
let (health, temperature, wear) = self.get_smart_data(&format!("/dev/{}", base_device));
|
|
||||||
|
|
||||||
Ok(DriveInfo {
|
|
||||||
device: base_device,
|
|
||||||
mount_point: path.to_string(),
|
|
||||||
temperature,
|
|
||||||
wear_level: wear,
|
|
||||||
health_status: health,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extract base device name from partition (e.g., "nvme0n1p2" -> "nvme0n1", "sda1" -> "sda")
|
|
||||||
fn extract_base_device(&self, device_name: &str) -> String {
|
|
||||||
// Handle NVMe devices (nvme0n1p1 -> nvme0n1)
|
|
||||||
if device_name.starts_with("nvme") {
|
|
||||||
if let Some(p_pos) = device_name.find('p') {
|
|
||||||
return device_name[..p_pos].to_string();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Handle traditional devices (sda1 -> sda)
|
|
||||||
if device_name.len() > 1 {
|
|
||||||
let chars: Vec<char> = device_name.chars().collect();
|
|
||||||
let mut end_idx = chars.len();
|
|
||||||
|
|
||||||
// Find where the device name ends and partition number begins
|
|
||||||
for (i, &c) in chars.iter().enumerate().rev() {
|
|
||||||
if !c.is_ascii_digit() {
|
|
||||||
end_idx = i + 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if end_idx > 0 && end_idx < chars.len() {
|
|
||||||
return chars[..end_idx].iter().collect();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If no partition detected, return as-is
|
|
||||||
device_name.to_string()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Group filesystems by physical drive (excluding mergerfs members)
|
|
||||||
fn group_by_physical_drive(
|
fn group_by_physical_drive(
|
||||||
&self,
|
&self,
|
||||||
mount_devices: &HashMap<String, String>,
|
mount_devices: &HashMap<String, String>,
|
||||||
filesystem_usage: &HashMap<String, (u64, u64)>,
|
filesystem_usage: &HashMap<String, (u64, u64)>,
|
||||||
mergerfs_pools: &[MergerfsPool]
|
mergerfs_pools: &[MergerfsPool]
|
||||||
) -> Result<Vec<PhysicalDrive>> {
|
) -> anyhow::Result<Vec<PhysicalDrive>> {
|
||||||
let mut drive_groups: HashMap<String, Vec<Filesystem>> = HashMap::new();
|
let mut drive_groups: HashMap<String, Vec<Filesystem>> = HashMap::new();
|
||||||
|
|
||||||
// Get all mergerfs member paths to exclude them
|
// Get all mergerfs member paths to exclude them - exactly like old code
|
||||||
let mut mergerfs_members = std::collections::HashSet::new();
|
let mut mergerfs_members = std::collections::HashSet::new();
|
||||||
for pool in mergerfs_pools {
|
for pool in mergerfs_pools {
|
||||||
for drive in &pool.data_drives {
|
for drive in &pool.data_drives {
|
||||||
@@ -347,558 +330,531 @@ impl DiskCollector {
|
|||||||
let base_device = self.extract_base_device(device);
|
let base_device = self.extract_base_device(device);
|
||||||
|
|
||||||
if let Some((total, used)) = filesystem_usage.get(mount_point) {
|
if let Some((total, used)) = filesystem_usage.get(mount_point) {
|
||||||
|
let usage_percent = (*used as f32 / *total as f32) * 100.0;
|
||||||
|
|
||||||
let filesystem = Filesystem {
|
let filesystem = Filesystem {
|
||||||
mount_point: mount_point.clone(),
|
mount_point: mount_point.clone(), // Keep actual mount point like "/" and "/boot"
|
||||||
total_bytes: *total,
|
usage_percent,
|
||||||
used_bytes: *used,
|
used_bytes: *used,
|
||||||
|
total_bytes: *total,
|
||||||
};
|
};
|
||||||
|
|
||||||
drive_groups.entry(base_device).or_insert_with(Vec::new).push(filesystem);
|
drive_groups.entry(base_device).or_insert_with(Vec::new).push(filesystem);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert to PhysicalDrive structs with SMART data
|
// Convert to PhysicalDrive structs
|
||||||
let mut physical_drives = Vec::new();
|
let mut physical_drives = Vec::new();
|
||||||
for (device, filesystems) in drive_groups {
|
for (drive_name, filesystems) in drive_groups {
|
||||||
let (health, temperature, wear) = self.get_smart_data(&format!("/dev/{}", device));
|
let physical_drive = PhysicalDrive {
|
||||||
|
name: drive_name,
|
||||||
physical_drives.push(PhysicalDrive {
|
health: "UNKNOWN".to_string(), // Will be updated with SMART data
|
||||||
device,
|
|
||||||
filesystems,
|
filesystems,
|
||||||
temperature,
|
};
|
||||||
wear_level: wear,
|
physical_drives.push(physical_drive);
|
||||||
health_status: health,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
physical_drives.sort_by(|a, b| a.name.cmp(&b.name));
|
||||||
Ok(physical_drives)
|
Ok(physical_drives)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get SMART data for a drive
|
/// Extract base device name from device path
|
||||||
fn get_smart_data(&self, device_path: &str) -> (String, Option<f32>, Option<f32>) {
|
fn extract_base_device(&self, device: &str) -> String {
|
||||||
let output = Command::new("sudo")
|
// Extract base device name (e.g., "/dev/nvme0n1p1" -> "nvme0n1")
|
||||||
.arg("smartctl")
|
if let Some(dev_name) = device.strip_prefix("/dev/") {
|
||||||
.arg("-a")
|
// Remove partition numbers: nvme0n1p1 -> nvme0n1, sda1 -> sda
|
||||||
.arg(device_path)
|
if let Some(pos) = dev_name.find('p') {
|
||||||
.output();
|
if dev_name[pos+1..].chars().all(char::is_numeric) {
|
||||||
|
return dev_name[..pos].to_string();
|
||||||
match output {
|
}
|
||||||
Ok(result) if result.status.success() => {
|
|
||||||
let stdout = String::from_utf8_lossy(&result.stdout);
|
|
||||||
|
|
||||||
// Parse health status
|
|
||||||
let health = if stdout.contains("PASSED") {
|
|
||||||
"PASSED".to_string()
|
|
||||||
} else if stdout.contains("FAILED") {
|
|
||||||
"FAILED".to_string()
|
|
||||||
} else {
|
|
||||||
"UNKNOWN".to_string()
|
|
||||||
};
|
|
||||||
|
|
||||||
// Parse temperature and wear level
|
|
||||||
let temperature = self.parse_temperature_from_smart(&stdout);
|
|
||||||
let wear_level = self.parse_wear_level_from_smart(&stdout);
|
|
||||||
|
|
||||||
(health, temperature, wear_level)
|
|
||||||
}
|
}
|
||||||
_ => {
|
// Handle traditional naming: sda1 -> sda
|
||||||
debug!("Failed to get SMART data for {}", device_path);
|
let mut result = String::new();
|
||||||
("UNKNOWN".to_string(), None, None)
|
for ch in dev_name.chars() {
|
||||||
|
if ch.is_ascii_digit() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
result.push(ch);
|
||||||
|
}
|
||||||
|
if !result.is_empty() {
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
device.to_string()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parse temperature from SMART output
|
/// Get SMART data for drives
|
||||||
fn parse_temperature_from_smart(&self, smart_output: &str) -> Option<f32> {
|
async fn get_smart_data_for_drives(&self, physical_drives: &[PhysicalDrive], mergerfs_pools: &[MergerfsPool]) -> HashMap<String, SmartData> {
|
||||||
for line in smart_output.lines() {
|
use tracing::info;
|
||||||
if line.contains("Temperature_Celsius") || line.contains("Temperature") {
|
let mut smart_data = HashMap::new();
|
||||||
|
|
||||||
|
// Collect all drive names
|
||||||
|
let mut all_drives = std::collections::HashSet::new();
|
||||||
|
for drive in physical_drives {
|
||||||
|
all_drives.insert(drive.name.clone());
|
||||||
|
}
|
||||||
|
for pool in mergerfs_pools {
|
||||||
|
for drive in &pool.data_drives {
|
||||||
|
all_drives.insert(drive.name.clone());
|
||||||
|
}
|
||||||
|
for drive in &pool.parity_drives {
|
||||||
|
all_drives.insert(drive.name.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Collecting SMART data for {} drives", all_drives.len());
|
||||||
|
|
||||||
|
// Get SMART data for each drive
|
||||||
|
for drive_name in &all_drives {
|
||||||
|
match self.get_smart_data(drive_name).await {
|
||||||
|
Ok(data) => {
|
||||||
|
info!("SMART data collected for {}: serial={:?}, temp={:?}, health={}",
|
||||||
|
drive_name, data.serial_number, data.temperature_celsius, data.health);
|
||||||
|
smart_data.insert(drive_name.clone(), data);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
info!("Failed to get SMART data for {}: {:?}", drive_name, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("SMART data collection complete: {}/{} drives successful", smart_data.len(), all_drives.len());
|
||||||
|
smart_data
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get SMART data for a single drive
|
||||||
|
async fn get_smart_data(&self, drive_name: &str) -> Result<SmartData, CollectorError> {
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
|
// Use direct smartctl (no sudo) - service has CAP_SYS_RAWIO capability
|
||||||
|
let output = Command::new("smartctl")
|
||||||
|
.args(&["-a", &format!("/dev/{}", drive_name)])
|
||||||
|
.output()
|
||||||
|
.map_err(|e| CollectorError::SystemRead {
|
||||||
|
path: format!("SMART data for {}", drive_name),
|
||||||
|
error: e.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||||
|
let error_str = String::from_utf8_lossy(&output.stderr);
|
||||||
|
|
||||||
|
// Debug logging for SMART command results
|
||||||
|
debug!("SMART output for {}: status={}, stdout_len={}, stderr={}",
|
||||||
|
drive_name, output.status, output_str.len(), error_str);
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
info!("SMART command failed for {}, status={}, stderr={}", drive_name, output.status, error_str);
|
||||||
|
// Return unknown data rather than failing completely
|
||||||
|
return Ok(SmartData {
|
||||||
|
health: "UNKNOWN".to_string(),
|
||||||
|
serial_number: None,
|
||||||
|
temperature_celsius: None,
|
||||||
|
wear_percent: None,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut health = "UNKNOWN".to_string();
|
||||||
|
let mut serial_number = None;
|
||||||
|
let mut temperature = None;
|
||||||
|
let mut wear_percent = None;
|
||||||
|
|
||||||
|
for line in output_str.lines() {
|
||||||
|
if line.contains("SMART overall-health") {
|
||||||
|
if line.contains("PASSED") {
|
||||||
|
health = "PASSED".to_string();
|
||||||
|
} else if line.contains("FAILED") {
|
||||||
|
health = "FAILED".to_string();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Serial number parsing (both SATA and NVMe)
|
||||||
|
if line.contains("Serial Number:") {
|
||||||
|
if let Some(serial_part) = line.split("Serial Number:").nth(1) {
|
||||||
|
let serial_str = serial_part.trim();
|
||||||
|
if !serial_str.is_empty() {
|
||||||
|
// Take first whitespace-separated token
|
||||||
|
if let Some(serial) = serial_str.split_whitespace().next() {
|
||||||
|
serial_number = Some(serial.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Temperature parsing for different drive types
|
||||||
|
if line.contains("Temperature_Celsius") || line.contains("Airflow_Temperature_Cel") || line.contains("Temperature_Case") {
|
||||||
|
// Traditional SATA drives: attribute table format
|
||||||
|
if let Some(temp_str) = line.split_whitespace().nth(9) {
|
||||||
|
if let Ok(temp) = temp_str.parse::<f32>() {
|
||||||
|
temperature = Some(temp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if line.starts_with("Temperature:") {
|
||||||
|
// NVMe drives: simple "Temperature: 27 Celsius" format
|
||||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||||
if parts.len() >= 10 {
|
if parts.len() >= 2 {
|
||||||
if let Ok(temp) = parts[9].parse::<f32>() {
|
if let Ok(temp) = parts[1].parse::<f32>() {
|
||||||
return Some(temp);
|
temperature = Some(temp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if line.contains("temperature:") {
|
|
||||||
if let Some(temp_part) = line.split("temperature:").nth(1) {
|
|
||||||
if let Some(temp_str) = temp_part.split_whitespace().next() {
|
|
||||||
if let Ok(temp) = temp_str.parse::<f32>() {
|
|
||||||
return Some(temp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Parse wear level from SMART output
|
// Wear level parsing for SSDs
|
||||||
fn parse_wear_level_from_smart(&self, smart_output: &str) -> Option<f32> {
|
if line.contains("Media_Wearout_Indicator") {
|
||||||
for line in smart_output.lines() {
|
// Media_Wearout_Indicator stores remaining life % in column 3 (VALUE)
|
||||||
|
if let Some(wear_str) = line.split_whitespace().nth(3) {
|
||||||
|
if let Ok(remaining) = wear_str.parse::<f32>() {
|
||||||
|
wear_percent = Some(100.0 - remaining); // Convert remaining life to wear
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if line.contains("Wear_Leveling_Count") || line.contains("SSD_Life_Left") {
|
||||||
|
// Other wear attributes store value in column 9 (RAW_VALUE)
|
||||||
|
if let Some(wear_str) = line.split_whitespace().nth(9) {
|
||||||
|
if let Ok(wear) = wear_str.parse::<f32>() {
|
||||||
|
wear_percent = Some(100.0 - wear); // Convert remaining life to wear
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// NVMe wear parsing: "Percentage Used: 1%"
|
||||||
if line.contains("Percentage Used:") {
|
if line.contains("Percentage Used:") {
|
||||||
if let Some(wear_part) = line.split("Percentage Used:").nth(1) {
|
if let Some(percent_part) = line.split("Percentage Used:").nth(1) {
|
||||||
if let Some(wear_str) = wear_part.split('%').next() {
|
if let Some(percent_str) = percent_part.split_whitespace().next() {
|
||||||
if let Ok(wear) = wear_str.trim().parse::<f32>() {
|
if let Some(percent_clean) = percent_str.strip_suffix('%') {
|
||||||
return Some(wear);
|
if let Ok(wear) = percent_clean.parse::<f32>() {
|
||||||
|
wear_percent = Some(wear);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
Ok(SmartData {
|
||||||
if parts.len() >= 10 {
|
health,
|
||||||
if line.contains("SSD_Life_Left") || line.contains("Percent_Lifetime_Remain") {
|
serial_number,
|
||||||
if let Ok(remaining) = parts[3].parse::<f32>() {
|
temperature_celsius: temperature,
|
||||||
return Some(100.0 - remaining);
|
wear_percent,
|
||||||
}
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Populate drives data into AgentData
|
||||||
|
fn populate_drives_data(&self, physical_drives: &[PhysicalDrive], smart_data: &HashMap<String, SmartData>, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
|
for drive in physical_drives {
|
||||||
|
let smart = smart_data.get(&drive.name);
|
||||||
|
|
||||||
|
let mut filesystems: Vec<FilesystemData> = drive.filesystems.iter().map(|fs| {
|
||||||
|
FilesystemData {
|
||||||
|
mount: fs.mount_point.clone(), // This preserves "/" and "/boot" correctly
|
||||||
|
usage_percent: fs.usage_percent,
|
||||||
|
used_gb: fs.used_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
|
||||||
|
total_gb: fs.total_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
|
||||||
|
usage_status: self.calculate_filesystem_usage_status(fs.usage_percent),
|
||||||
}
|
}
|
||||||
if line.contains("Wear_Leveling_Count") {
|
}).collect();
|
||||||
if let Ok(wear_count) = parts[3].parse::<f32>() {
|
|
||||||
if wear_count <= 100.0 {
|
// Sort filesystems by mount point for consistent display order
|
||||||
return Some(100.0 - wear_count);
|
filesystems.sort_by(|a, b| a.mount.cmp(&b.mount));
|
||||||
|
|
||||||
|
agent_data.system.storage.drives.push(DriveData {
|
||||||
|
name: drive.name.clone(),
|
||||||
|
serial_number: smart.and_then(|s| s.serial_number.clone()),
|
||||||
|
health: smart.map(|s| s.health.clone()).unwrap_or_else(|| drive.health.clone()),
|
||||||
|
temperature_celsius: smart.and_then(|s| s.temperature_celsius),
|
||||||
|
wear_percent: smart.and_then(|s| s.wear_percent),
|
||||||
|
filesystems,
|
||||||
|
temperature_status: smart.and_then(|s| s.temperature_celsius)
|
||||||
|
.map(|temp| self.calculate_temperature_status(temp))
|
||||||
|
.unwrap_or(Status::Unknown),
|
||||||
|
health_status: self.calculate_health_status(
|
||||||
|
smart.map(|s| s.health.as_str()).unwrap_or("UNKNOWN")
|
||||||
|
),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Populate pools data into AgentData
|
||||||
|
fn populate_pools_data(&self, mergerfs_pools: &[MergerfsPool], smart_data: &HashMap<String, SmartData>, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
|
for pool in mergerfs_pools {
|
||||||
|
// Calculate pool health and statuses based on member drive health
|
||||||
|
let (pool_health, health_status, usage_status, data_drive_data, parity_drive_data) = self.calculate_pool_health(pool, smart_data);
|
||||||
|
|
||||||
|
let pool_data = PoolData {
|
||||||
|
name: pool.name.clone(),
|
||||||
|
mount: pool.mount_point.clone(),
|
||||||
|
pool_type: format!("mergerfs ({}+{})", pool.data_drives.len(), pool.parity_drives.len()),
|
||||||
|
health: pool_health,
|
||||||
|
usage_percent: if pool.total_bytes > 0 {
|
||||||
|
(pool.used_bytes as f32 / pool.total_bytes as f32) * 100.0
|
||||||
|
} else { 0.0 },
|
||||||
|
used_gb: pool.used_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
|
||||||
|
total_gb: pool.total_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
|
||||||
|
data_drives: data_drive_data,
|
||||||
|
parity_drives: parity_drive_data,
|
||||||
|
health_status,
|
||||||
|
usage_status,
|
||||||
|
};
|
||||||
|
|
||||||
|
agent_data.system.storage.pools.push(pool_data);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate pool health based on member drive status
|
||||||
|
fn calculate_pool_health(&self, pool: &MergerfsPool, smart_data: &HashMap<String, SmartData>) -> (String, cm_dashboard_shared::Status, cm_dashboard_shared::Status, Vec<cm_dashboard_shared::PoolDriveData>, Vec<cm_dashboard_shared::PoolDriveData>) {
|
||||||
|
let mut failed_data = 0;
|
||||||
|
let mut failed_parity = 0;
|
||||||
|
|
||||||
|
// Process data drives
|
||||||
|
let data_drive_data: Vec<cm_dashboard_shared::PoolDriveData> = pool.data_drives.iter().map(|d| {
|
||||||
|
let smart = smart_data.get(&d.name);
|
||||||
|
let health = smart.map(|s| s.health.clone()).unwrap_or_else(|| "UNKNOWN".to_string());
|
||||||
|
let temperature = smart.and_then(|s| s.temperature_celsius).or(d.temperature_celsius);
|
||||||
|
|
||||||
|
if health == "FAILED" {
|
||||||
|
failed_data += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate drive statuses using config thresholds
|
||||||
|
let health_status = self.calculate_health_status(&health);
|
||||||
|
let temperature_status = temperature.map(|t| self.temperature_thresholds.evaluate(t)).unwrap_or(cm_dashboard_shared::Status::Unknown);
|
||||||
|
|
||||||
|
cm_dashboard_shared::PoolDriveData {
|
||||||
|
name: d.name.clone(),
|
||||||
|
serial_number: smart.and_then(|s| s.serial_number.clone()),
|
||||||
|
temperature_celsius: temperature,
|
||||||
|
health,
|
||||||
|
wear_percent: smart.and_then(|s| s.wear_percent),
|
||||||
|
health_status,
|
||||||
|
temperature_status,
|
||||||
|
}
|
||||||
|
}).collect();
|
||||||
|
|
||||||
|
// Process parity drives
|
||||||
|
let parity_drive_data: Vec<cm_dashboard_shared::PoolDriveData> = pool.parity_drives.iter().map(|d| {
|
||||||
|
let smart = smart_data.get(&d.name);
|
||||||
|
let health = smart.map(|s| s.health.clone()).unwrap_or_else(|| "UNKNOWN".to_string());
|
||||||
|
let temperature = smart.and_then(|s| s.temperature_celsius).or(d.temperature_celsius);
|
||||||
|
|
||||||
|
if health == "FAILED" {
|
||||||
|
failed_parity += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate drive statuses using config thresholds
|
||||||
|
let health_status = self.calculate_health_status(&health);
|
||||||
|
let temperature_status = temperature.map(|t| self.temperature_thresholds.evaluate(t)).unwrap_or(cm_dashboard_shared::Status::Unknown);
|
||||||
|
|
||||||
|
cm_dashboard_shared::PoolDriveData {
|
||||||
|
name: d.name.clone(),
|
||||||
|
serial_number: smart.and_then(|s| s.serial_number.clone()),
|
||||||
|
temperature_celsius: temperature,
|
||||||
|
health,
|
||||||
|
wear_percent: smart.and_then(|s| s.wear_percent),
|
||||||
|
health_status,
|
||||||
|
temperature_status,
|
||||||
|
}
|
||||||
|
}).collect();
|
||||||
|
|
||||||
|
// Calculate overall pool health string and status
|
||||||
|
// SnapRAID logic: can tolerate up to N parity drive failures (where N = number of parity drives)
|
||||||
|
// If data drives fail AND we've lost parity protection, that's critical
|
||||||
|
let (pool_health, health_status) = if failed_data == 0 && failed_parity == 0 {
|
||||||
|
("healthy".to_string(), cm_dashboard_shared::Status::Ok)
|
||||||
|
} else if failed_data == 0 && failed_parity > 0 {
|
||||||
|
// Parity failed but no data loss - degraded (reduced protection)
|
||||||
|
("degraded".to_string(), cm_dashboard_shared::Status::Warning)
|
||||||
|
} else if failed_data == 1 && failed_parity == 0 {
|
||||||
|
// One data drive failed, parity intact - degraded (recoverable)
|
||||||
|
("degraded".to_string(), cm_dashboard_shared::Status::Warning)
|
||||||
|
} else {
|
||||||
|
// Multiple data drives failed OR data+parity failed = data loss risk
|
||||||
|
("critical".to_string(), cm_dashboard_shared::Status::Critical)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Calculate pool usage status using config thresholds
|
||||||
|
let usage_percent = if pool.total_bytes > 0 {
|
||||||
|
(pool.used_bytes as f32 / pool.total_bytes as f32) * 100.0
|
||||||
|
} else { 0.0 };
|
||||||
|
|
||||||
|
let usage_status = if usage_percent >= self.config.usage_critical_percent {
|
||||||
|
cm_dashboard_shared::Status::Critical
|
||||||
|
} else if usage_percent >= self.config.usage_warning_percent {
|
||||||
|
cm_dashboard_shared::Status::Warning
|
||||||
|
} else {
|
||||||
|
cm_dashboard_shared::Status::Ok
|
||||||
|
};
|
||||||
|
|
||||||
|
(pool_health, health_status, usage_status, data_drive_data, parity_drive_data)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate filesystem usage status
|
||||||
|
fn calculate_filesystem_usage_status(&self, usage_percent: f32) -> Status {
|
||||||
|
// Use standard filesystem warning/critical thresholds
|
||||||
|
if usage_percent >= 95.0 {
|
||||||
|
Status::Critical
|
||||||
|
} else if usage_percent >= 85.0 {
|
||||||
|
Status::Warning
|
||||||
|
} else {
|
||||||
|
Status::Ok
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate drive temperature status
|
||||||
|
fn calculate_temperature_status(&self, temperature: f32) -> Status {
|
||||||
|
self.temperature_thresholds.evaluate(temperature)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate drive health status
|
||||||
|
fn calculate_health_status(&self, health: &str) -> Status {
|
||||||
|
match health {
|
||||||
|
"PASSED" => Status::Ok,
|
||||||
|
"FAILED" => Status::Critical,
|
||||||
|
_ => Status::Unknown,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Discover parity drives that are related to the given data drives
|
||||||
|
fn discover_related_parity_drives(&self, data_drives: &[String]) -> anyhow::Result<Vec<String>> {
|
||||||
|
let mount_devices = tokio::task::block_in_place(|| {
|
||||||
|
tokio::runtime::Handle::current().block_on(self.get_mount_devices())
|
||||||
|
}).map_err(|e| anyhow::anyhow!("Failed to get mount devices: {}", e))?;
|
||||||
|
|
||||||
|
let mut related_parity = Vec::new();
|
||||||
|
|
||||||
|
// Find parity drives that share the same parent directory as the data drives
|
||||||
|
for data_path in data_drives {
|
||||||
|
if let Some(parent_dir) = self.get_parent_directory(data_path) {
|
||||||
|
// Look for parity drives in the same parent directory
|
||||||
|
for (mount_point, _device) in &mount_devices {
|
||||||
|
if mount_point.contains("parity") && mount_point.starts_with(&parent_dir) {
|
||||||
|
if !related_parity.contains(mount_point) {
|
||||||
|
related_parity.push(mount_point.clone());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok(related_parity)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get parent directory of a mount path (e.g., "/mnt/disk1" -> "/mnt")
|
||||||
|
fn get_parent_directory(&self, path: &str) -> Option<String> {
|
||||||
|
if let Some(last_slash) = path.rfind('/') {
|
||||||
|
if last_slash > 0 {
|
||||||
|
return Some(path[..last_slash].to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate temperature status with hysteresis
|
/// Categorize pool member drives as data vs parity
|
||||||
fn calculate_temperature_status(&self, metric_name: &str, temperature: f32, status_tracker: &mut StatusTracker) -> Status {
|
fn categorize_pool_drives(&self, member_paths: &[String]) -> anyhow::Result<(Vec<PoolDrive>, Vec<PoolDrive>)> {
|
||||||
status_tracker.calculate_with_hysteresis(metric_name, temperature, &self.temperature_thresholds)
|
let mut data_drives = Vec::new();
|
||||||
}
|
let mut parity_drives = Vec::new();
|
||||||
|
|
||||||
/// Convert bytes to human readable format
|
for path in member_paths {
|
||||||
fn bytes_to_human_readable(&self, bytes: u64) -> String {
|
let drive_info = self.get_drive_info_for_path(path)?;
|
||||||
const UNITS: &[&str] = &["B", "K", "M", "G", "T"];
|
|
||||||
let mut size = bytes as f64;
|
|
||||||
let mut unit_index = 0;
|
|
||||||
|
|
||||||
while size >= 1024.0 && unit_index < UNITS.len() - 1 {
|
// Heuristic: if path contains "parity", it's parity
|
||||||
size /= 1024.0;
|
if path.to_lowercase().contains("parity") {
|
||||||
unit_index += 1;
|
parity_drives.push(drive_info);
|
||||||
|
} else {
|
||||||
|
data_drives.push(drive_info);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if unit_index == 0 {
|
Ok((data_drives, parity_drives))
|
||||||
format!("{:.0}{}", size, UNITS[unit_index])
|
}
|
||||||
|
|
||||||
|
/// Get drive information for a mount path
|
||||||
|
fn get_drive_info_for_path(&self, path: &str) -> anyhow::Result<PoolDrive> {
|
||||||
|
// Use lsblk to find the backing device
|
||||||
|
let output = Command::new("lsblk")
|
||||||
|
.args(&["-rn", "-o", "NAME,MOUNTPOINT"])
|
||||||
|
.output()
|
||||||
|
.map_err(|e| anyhow::anyhow!("Failed to run lsblk: {}", e))?;
|
||||||
|
|
||||||
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||||
|
let mut device = String::new();
|
||||||
|
|
||||||
|
for line in output_str.lines() {
|
||||||
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||||
|
if parts.len() >= 2 && parts[1] == path {
|
||||||
|
device = parts[0].to_string();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if device.is_empty() {
|
||||||
|
return Err(anyhow::anyhow!("Could not find device for path {}", path));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract base device name (e.g., "sda1" -> "sda")
|
||||||
|
let base_device = self.extract_base_device(&format!("/dev/{}", device));
|
||||||
|
|
||||||
|
// Get temperature from SMART data if available
|
||||||
|
let temperature = if let Ok(smart_data) = tokio::task::block_in_place(|| {
|
||||||
|
tokio::runtime::Handle::current().block_on(self.get_smart_data(&base_device))
|
||||||
|
}) {
|
||||||
|
smart_data.temperature_celsius
|
||||||
} else {
|
} else {
|
||||||
format!("{:.1}{}", size, UNITS[unit_index])
|
None
|
||||||
}
|
};
|
||||||
|
|
||||||
|
Ok(PoolDrive {
|
||||||
|
name: base_device,
|
||||||
|
mount_point: path.to_string(),
|
||||||
|
temperature_celsius: temperature,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Convert bytes to gigabytes
|
/// Resolve numeric mergerfs references like "1:2" to actual mount paths
|
||||||
fn bytes_to_gb(&self, bytes: u64) -> f32 {
|
fn resolve_numeric_mergerfs_paths(&self, numeric_refs: &[String]) -> anyhow::Result<Vec<String>> {
|
||||||
bytes as f32 / (1024.0 * 1024.0 * 1024.0)
|
let mut resolved_paths = Vec::new();
|
||||||
|
|
||||||
|
// Get all mount points that look like /mnt/disk* or /mnt/parity*
|
||||||
|
let mount_devices = tokio::task::block_in_place(|| {
|
||||||
|
tokio::runtime::Handle::current().block_on(self.get_mount_devices())
|
||||||
|
}).map_err(|e| anyhow::anyhow!("Failed to get mount devices: {}", e))?;
|
||||||
|
|
||||||
|
let mut disk_mounts: Vec<String> = mount_devices.keys()
|
||||||
|
.filter(|path| path.starts_with("/mnt/disk") || path.starts_with("/mnt/parity"))
|
||||||
|
.cloned()
|
||||||
|
.collect();
|
||||||
|
disk_mounts.sort(); // Ensure consistent ordering
|
||||||
|
|
||||||
|
for num_ref in numeric_refs {
|
||||||
|
if let Ok(index) = num_ref.parse::<usize>() {
|
||||||
|
// Convert 1-based index to 0-based
|
||||||
|
if index > 0 && index <= disk_mounts.len() {
|
||||||
|
resolved_paths.push(disk_mounts[index - 1].clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: if we couldn't resolve, return the original paths
|
||||||
|
if resolved_paths.is_empty() {
|
||||||
|
resolved_paths = numeric_refs.to_vec();
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(resolved_paths)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl Collector for DiskCollector {
|
impl Collector for DiskCollector {
|
||||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
let start_time = Instant::now();
|
self.collect_storage_data(agent_data).await
|
||||||
debug!("Starting clean storage collection");
|
|
||||||
|
|
||||||
let mut metrics = Vec::new();
|
|
||||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
|
||||||
|
|
||||||
// Discover storage topology
|
|
||||||
let topology = match self.discover_storage() {
|
|
||||||
Ok(topology) => topology,
|
|
||||||
Err(e) => {
|
|
||||||
debug!("Storage discovery failed: {}", e);
|
|
||||||
return Ok(metrics);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Generate metrics for physical drives
|
|
||||||
for drive in &topology.physical_drives {
|
|
||||||
self.generate_physical_drive_metrics(&mut metrics, drive, timestamp, status_tracker);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Generate metrics for mergerfs pools
|
|
||||||
for pool in &topology.mergerfs_pools {
|
|
||||||
self.generate_mergerfs_pool_metrics(&mut metrics, pool, timestamp, status_tracker);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add total storage count
|
|
||||||
let total_storage = topology.physical_drives.len() + topology.mergerfs_pools.len();
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "disk_count".to_string(),
|
|
||||||
value: MetricValue::Integer(total_storage as i64),
|
|
||||||
unit: None,
|
|
||||||
description: Some(format!("Total storage: {} drives, {} pools", topology.physical_drives.len(), topology.mergerfs_pools.len())),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
let collection_time = start_time.elapsed();
|
|
||||||
debug!("Clean storage collection completed in {:?} with {} metrics", collection_time, metrics.len());
|
|
||||||
|
|
||||||
Ok(metrics)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DiskCollector {
|
/// SMART data for a drive
|
||||||
/// Generate metrics for a physical drive and its filesystems
|
#[derive(Debug, Clone)]
|
||||||
fn generate_physical_drive_metrics(
|
struct SmartData {
|
||||||
&self,
|
health: String,
|
||||||
metrics: &mut Vec<Metric>,
|
serial_number: Option<String>,
|
||||||
drive: &PhysicalDrive,
|
temperature_celsius: Option<f32>,
|
||||||
timestamp: u64,
|
wear_percent: Option<f32>,
|
||||||
status_tracker: &mut StatusTracker
|
|
||||||
) {
|
|
||||||
let drive_name = &drive.device;
|
|
||||||
|
|
||||||
// Calculate drive totals
|
|
||||||
let total_capacity: u64 = drive.filesystems.iter().map(|fs| fs.total_bytes).sum();
|
|
||||||
let total_used: u64 = drive.filesystems.iter().map(|fs| fs.used_bytes).sum();
|
|
||||||
let total_available = total_capacity.saturating_sub(total_used);
|
|
||||||
let usage_percent = if total_capacity > 0 {
|
|
||||||
(total_used as f64 / total_capacity as f64) * 100.0
|
|
||||||
} else { 0.0 };
|
|
||||||
|
|
||||||
// Drive health status
|
|
||||||
let health_status = if drive.health_status == "PASSED" { Status::Ok }
|
|
||||||
else if drive.health_status == "FAILED" { Status::Critical }
|
|
||||||
else { Status::Unknown };
|
|
||||||
|
|
||||||
// Usage status
|
|
||||||
let usage_status = if usage_percent >= self.config.usage_critical_percent as f64 {
|
|
||||||
Status::Critical
|
|
||||||
} else if usage_percent >= self.config.usage_warning_percent as f64 {
|
|
||||||
Status::Warning
|
|
||||||
} else {
|
|
||||||
Status::Ok
|
|
||||||
};
|
|
||||||
|
|
||||||
let drive_status = if health_status == Status::Critical { Status::Critical } else { usage_status };
|
|
||||||
|
|
||||||
// Drive info metrics
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_health", drive_name),
|
|
||||||
value: MetricValue::String(drive.health_status.clone()),
|
|
||||||
unit: None,
|
|
||||||
description: Some(format!("{}: {}", drive_name, drive.health_status)),
|
|
||||||
status: health_status,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Drive temperature
|
|
||||||
if let Some(temp) = drive.temperature {
|
|
||||||
let temp_status = self.calculate_temperature_status(
|
|
||||||
&format!("disk_{}_temperature", drive_name), temp, status_tracker
|
|
||||||
);
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_temperature", drive_name),
|
|
||||||
value: MetricValue::Float(temp),
|
|
||||||
unit: Some("°C".to_string()),
|
|
||||||
description: Some(format!("{}: {:.0}°C", drive_name, temp)),
|
|
||||||
status: temp_status,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Drive wear level
|
|
||||||
if let Some(wear) = drive.wear_level {
|
|
||||||
let wear_status = if wear >= self.config.wear_critical_percent { Status::Critical }
|
|
||||||
else if wear >= self.config.wear_warning_percent { Status::Warning }
|
|
||||||
else { Status::Ok };
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_wear_percent", drive_name),
|
|
||||||
value: MetricValue::Float(wear),
|
|
||||||
unit: Some("%".to_string()),
|
|
||||||
description: Some(format!("{}: {:.0}% wear", drive_name, wear)),
|
|
||||||
status: wear_status,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Drive capacity metrics
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_total_gb", drive_name),
|
|
||||||
value: MetricValue::Float(self.bytes_to_gb(total_capacity)),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
description: Some(format!("{}: {}", drive_name, self.bytes_to_human_readable(total_capacity))),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_used_gb", drive_name),
|
|
||||||
value: MetricValue::Float(self.bytes_to_gb(total_used)),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
description: Some(format!("{}: {}", drive_name, self.bytes_to_human_readable(total_used))),
|
|
||||||
status: drive_status.clone(),
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_available_gb", drive_name),
|
|
||||||
value: MetricValue::Float(self.bytes_to_gb(total_available)),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
description: Some(format!("{}: {}", drive_name, self.bytes_to_human_readable(total_available))),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_usage_percent", drive_name),
|
|
||||||
value: MetricValue::Float(usage_percent as f32),
|
|
||||||
unit: Some("%".to_string()),
|
|
||||||
description: Some(format!("{}: {:.1}%", drive_name, usage_percent)),
|
|
||||||
status: drive_status,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Pool type indicator
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_pool_type", drive_name),
|
|
||||||
value: MetricValue::String(format!("drive ({})", drive.filesystems.len())),
|
|
||||||
unit: None,
|
|
||||||
description: Some(format!("Type: physical drive")),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Individual filesystem metrics
|
|
||||||
for filesystem in &drive.filesystems {
|
|
||||||
let fs_name = if filesystem.mount_point == "/" {
|
|
||||||
"root".to_string()
|
|
||||||
} else {
|
|
||||||
filesystem.mount_point.trim_start_matches('/').replace('/', "_")
|
|
||||||
};
|
|
||||||
|
|
||||||
let fs_usage_percent = if filesystem.total_bytes > 0 {
|
|
||||||
(filesystem.used_bytes as f64 / filesystem.total_bytes as f64) * 100.0
|
|
||||||
} else { 0.0 };
|
|
||||||
|
|
||||||
let fs_status = if fs_usage_percent >= self.config.usage_critical_percent as f64 {
|
|
||||||
Status::Critical
|
|
||||||
} else if fs_usage_percent >= self.config.usage_warning_percent as f64 {
|
|
||||||
Status::Warning
|
|
||||||
} else {
|
|
||||||
Status::Ok
|
|
||||||
};
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_fs_{}_usage_percent", drive_name, fs_name),
|
|
||||||
value: MetricValue::Float(fs_usage_percent as f32),
|
|
||||||
unit: Some("%".to_string()),
|
|
||||||
description: Some(format!("{}: {:.0}%", filesystem.mount_point, fs_usage_percent)),
|
|
||||||
status: fs_status.clone(),
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_fs_{}_used_gb", drive_name, fs_name),
|
|
||||||
value: MetricValue::Float(self.bytes_to_gb(filesystem.used_bytes)),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
description: Some(format!("{}: {}", filesystem.mount_point, self.bytes_to_human_readable(filesystem.used_bytes))),
|
|
||||||
status: fs_status.clone(),
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_fs_{}_total_gb", drive_name, fs_name),
|
|
||||||
value: MetricValue::Float(self.bytes_to_gb(filesystem.total_bytes)),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
description: Some(format!("{}: {}", filesystem.mount_point, self.bytes_to_human_readable(filesystem.total_bytes))),
|
|
||||||
status: fs_status.clone(),
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
let fs_available = filesystem.total_bytes.saturating_sub(filesystem.used_bytes);
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_fs_{}_available_gb", drive_name, fs_name),
|
|
||||||
value: MetricValue::Float(self.bytes_to_gb(fs_available)),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
description: Some(format!("{}: {}", filesystem.mount_point, self.bytes_to_human_readable(fs_available))),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_fs_{}_mount_point", drive_name, fs_name),
|
|
||||||
value: MetricValue::String(filesystem.mount_point.clone()),
|
|
||||||
unit: None,
|
|
||||||
description: Some(format!("Mount: {}", filesystem.mount_point)),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Generate metrics for a mergerfs pool
|
|
||||||
fn generate_mergerfs_pool_metrics(
|
|
||||||
&self,
|
|
||||||
metrics: &mut Vec<Metric>,
|
|
||||||
pool: &MergerfsPool,
|
|
||||||
timestamp: u64,
|
|
||||||
status_tracker: &mut StatusTracker
|
|
||||||
) {
|
|
||||||
let pool_name = pool.mount_point.trim_start_matches('/').replace('/', "_");
|
|
||||||
if pool_name.is_empty() {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
let usage_percent = if pool.total_bytes > 0 {
|
|
||||||
(pool.used_bytes as f64 / pool.total_bytes as f64) * 100.0
|
|
||||||
} else { 0.0 };
|
|
||||||
|
|
||||||
// Calculate pool health based on drive health
|
|
||||||
let failed_data = pool.data_drives.iter()
|
|
||||||
.filter(|d| d.health_status != "PASSED")
|
|
||||||
.count();
|
|
||||||
let failed_parity = pool.parity_drives.iter()
|
|
||||||
.filter(|d| d.health_status != "PASSED")
|
|
||||||
.count();
|
|
||||||
|
|
||||||
let pool_health = match (failed_data, failed_parity) {
|
|
||||||
(0, 0) => Status::Ok,
|
|
||||||
(1, 0) | (0, 1) => Status::Warning,
|
|
||||||
_ => Status::Critical,
|
|
||||||
};
|
|
||||||
|
|
||||||
let usage_status = if usage_percent >= self.config.usage_critical_percent as f64 {
|
|
||||||
Status::Critical
|
|
||||||
} else if usage_percent >= self.config.usage_warning_percent as f64 {
|
|
||||||
Status::Warning
|
|
||||||
} else {
|
|
||||||
Status::Ok
|
|
||||||
};
|
|
||||||
|
|
||||||
let pool_status = if pool_health == Status::Critical { Status::Critical } else { usage_status };
|
|
||||||
|
|
||||||
// Pool metrics
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_mount_point", pool_name),
|
|
||||||
value: MetricValue::String(pool.mount_point.clone()),
|
|
||||||
unit: None,
|
|
||||||
description: Some(format!("Mount: {}", pool.mount_point)),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_pool_type", pool_name),
|
|
||||||
value: MetricValue::String(format!("mergerfs ({}+{})", pool.data_drives.len(), pool.parity_drives.len())),
|
|
||||||
unit: None,
|
|
||||||
description: Some("Type: mergerfs".to_string()),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_pool_health", pool_name),
|
|
||||||
value: MetricValue::String(match pool_health {
|
|
||||||
Status::Ok => "healthy".to_string(),
|
|
||||||
Status::Warning => "degraded".to_string(),
|
|
||||||
Status::Critical => "critical".to_string(),
|
|
||||||
_ => "unknown".to_string(),
|
|
||||||
}),
|
|
||||||
unit: None,
|
|
||||||
description: Some("Pool health".to_string()),
|
|
||||||
status: pool_health,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_total_gb", pool_name),
|
|
||||||
value: MetricValue::Float(self.bytes_to_gb(pool.total_bytes)),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
description: Some(format!("Total: {}", self.bytes_to_human_readable(pool.total_bytes))),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_used_gb", pool_name),
|
|
||||||
value: MetricValue::Float(self.bytes_to_gb(pool.used_bytes)),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
description: Some(format!("Used: {}", self.bytes_to_human_readable(pool.used_bytes))),
|
|
||||||
status: pool_status.clone(),
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
let available_bytes = pool.total_bytes.saturating_sub(pool.used_bytes);
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_available_gb", pool_name),
|
|
||||||
value: MetricValue::Float(self.bytes_to_gb(available_bytes)),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
description: Some(format!("Available: {}", self.bytes_to_human_readable(available_bytes))),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_usage_percent", pool_name),
|
|
||||||
value: MetricValue::Float(usage_percent as f32),
|
|
||||||
unit: Some("%".to_string()),
|
|
||||||
description: Some(format!("Usage: {:.1}%", usage_percent)),
|
|
||||||
status: pool_status,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Individual drive metrics
|
|
||||||
for (i, drive) in pool.data_drives.iter().enumerate() {
|
|
||||||
self.generate_pool_drive_metrics(metrics, &pool_name, &format!("data_{}", i), drive, timestamp, status_tracker);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i, drive) in pool.parity_drives.iter().enumerate() {
|
|
||||||
self.generate_pool_drive_metrics(metrics, &pool_name, &format!("parity_{}", i), drive, timestamp, status_tracker);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Generate metrics for drives in mergerfs pools
|
|
||||||
fn generate_pool_drive_metrics(
|
|
||||||
&self,
|
|
||||||
metrics: &mut Vec<Metric>,
|
|
||||||
pool_name: &str,
|
|
||||||
drive_role: &str,
|
|
||||||
drive: &DriveInfo,
|
|
||||||
timestamp: u64,
|
|
||||||
status_tracker: &mut StatusTracker
|
|
||||||
) {
|
|
||||||
let drive_health = if drive.health_status == "PASSED" { Status::Ok }
|
|
||||||
else if drive.health_status == "FAILED" { Status::Critical }
|
|
||||||
else { Status::Unknown };
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_{}_health", pool_name, drive_role),
|
|
||||||
value: MetricValue::String(drive.health_status.clone()),
|
|
||||||
unit: None,
|
|
||||||
description: Some(format!("{}: {}", drive.device, drive.health_status)),
|
|
||||||
status: drive_health,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
if let Some(temp) = drive.temperature {
|
|
||||||
let temp_status = self.calculate_temperature_status(
|
|
||||||
&format!("disk_{}_{}_temperature", pool_name, drive_role), temp, status_tracker
|
|
||||||
);
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_{}_temperature", pool_name, drive_role),
|
|
||||||
value: MetricValue::Float(temp),
|
|
||||||
unit: Some("°C".to_string()),
|
|
||||||
description: Some(format!("{}: {:.0}°C", drive.device, temp)),
|
|
||||||
status: temp_status,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(wear) = drive.wear_level {
|
|
||||||
let wear_status = if wear >= self.config.wear_critical_percent { Status::Critical }
|
|
||||||
else if wear >= self.config.wear_warning_percent { Status::Warning }
|
|
||||||
else { Status::Ok };
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("disk_{}_{}_wear_percent", pool_name, drive_role),
|
|
||||||
value: MetricValue::Float(wear),
|
|
||||||
unit: Some("%".to_string()),
|
|
||||||
description: Some(format!("{}: {:.0}% wear", drive.device, wear)),
|
|
||||||
status: wear_status,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
use cm_dashboard_shared::{AgentData, TmpfsData, HysteresisThresholds, Status};
|
||||||
|
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
@@ -10,34 +10,19 @@ use crate::config::MemoryConfig;
|
|||||||
///
|
///
|
||||||
/// EFFICIENCY OPTIMIZATIONS:
|
/// EFFICIENCY OPTIMIZATIONS:
|
||||||
/// - Single /proc/meminfo read for all memory metrics
|
/// - Single /proc/meminfo read for all memory metrics
|
||||||
/// - Minimal string parsing with split operations
|
/// - Minimal string allocations
|
||||||
/// - Pre-calculated KB to GB conversion
|
/// - No process spawning for basic metrics
|
||||||
/// - No regex or complex parsing
|
/// - <0.5ms collection time target
|
||||||
/// - <0.1ms collection time target
|
|
||||||
pub struct MemoryCollector {
|
pub struct MemoryCollector {
|
||||||
usage_thresholds: HysteresisThresholds,
|
usage_thresholds: HysteresisThresholds,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Memory information parsed from /proc/meminfo
|
|
||||||
#[derive(Debug, Default)]
|
|
||||||
struct MemoryInfo {
|
|
||||||
total_kb: u64,
|
|
||||||
available_kb: u64,
|
|
||||||
free_kb: u64,
|
|
||||||
buffers_kb: u64,
|
|
||||||
cached_kb: u64,
|
|
||||||
swap_total_kb: u64,
|
|
||||||
swap_free_kb: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl MemoryCollector {
|
impl MemoryCollector {
|
||||||
pub fn new(config: MemoryConfig) -> Self {
|
pub fn new(config: MemoryConfig) -> Self {
|
||||||
// Create hysteresis thresholds with 5% gap for memory usage
|
// Create hysteresis thresholds with 10% gap for recovery
|
||||||
let usage_thresholds = HysteresisThresholds::with_custom_gaps(
|
let usage_thresholds = HysteresisThresholds::new(
|
||||||
config.usage_warning_percent,
|
config.usage_warning_percent,
|
||||||
5.0, // 5% gap for warning recovery
|
|
||||||
config.usage_critical_percent,
|
config.usage_critical_percent,
|
||||||
5.0, // 5% gap for critical recovery
|
|
||||||
);
|
);
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
@@ -45,11 +30,6 @@ impl MemoryCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate memory usage status using hysteresis thresholds
|
|
||||||
fn calculate_usage_status(&self, metric_name: &str, usage_percent: f32, status_tracker: &mut StatusTracker) -> Status {
|
|
||||||
status_tracker.calculate_with_hysteresis(metric_name, usage_percent, &self.usage_thresholds)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Parse /proc/meminfo efficiently
|
/// Parse /proc/meminfo efficiently
|
||||||
/// Format: "MemTotal: 16384000 kB"
|
/// Format: "MemTotal: 16384000 kB"
|
||||||
async fn parse_meminfo(&self) -> Result<MemoryInfo, CollectorError> {
|
async fn parse_meminfo(&self) -> Result<MemoryInfo, CollectorError> {
|
||||||
@@ -96,212 +76,141 @@ impl MemoryCollector {
|
|||||||
Ok(info)
|
Ok(info)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Convert KB to GB efficiently (avoiding floating point in hot path)
|
/// Populate memory data directly into AgentData
|
||||||
fn kb_to_gb(kb: u64) -> f32 {
|
async fn populate_memory_data(&self, info: &MemoryInfo, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
kb as f32 / 1_048_576.0 // 1024 * 1024
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Calculate memory metrics from parsed info
|
|
||||||
fn calculate_metrics(&self, info: &MemoryInfo, status_tracker: &mut StatusTracker) -> Vec<Metric> {
|
|
||||||
let mut metrics = Vec::with_capacity(6);
|
|
||||||
|
|
||||||
// Calculate derived values
|
// Calculate derived values
|
||||||
let used_kb = info.total_kb - info.available_kb;
|
let available = info.available_kb;
|
||||||
let usage_percent = (used_kb as f32 / info.total_kb as f32) * 100.0;
|
let used = info.total_kb - available;
|
||||||
let usage_status = self.calculate_usage_status(registry::MEMORY_USAGE_PERCENT, usage_percent, status_tracker);
|
let usage_percent = (used as f32 / info.total_kb as f32) * 100.0;
|
||||||
|
|
||||||
let swap_used_kb = info.swap_total_kb - info.swap_free_kb;
|
// Populate basic memory fields
|
||||||
|
agent_data.system.memory.usage_percent = usage_percent;
|
||||||
|
agent_data.system.memory.total_gb = info.total_kb as f32 / (1024.0 * 1024.0);
|
||||||
|
agent_data.system.memory.used_gb = used as f32 / (1024.0 * 1024.0);
|
||||||
|
|
||||||
// Convert to GB for metrics
|
// Populate swap data if available
|
||||||
let total_gb = Self::kb_to_gb(info.total_kb);
|
agent_data.system.memory.swap_total_gb = info.swap_total_kb as f32 / (1024.0 * 1024.0);
|
||||||
let used_gb = Self::kb_to_gb(used_kb);
|
agent_data.system.memory.swap_used_gb = (info.swap_total_kb - info.swap_free_kb) as f32 / (1024.0 * 1024.0);
|
||||||
let available_gb = Self::kb_to_gb(info.available_kb);
|
|
||||||
let swap_total_gb = Self::kb_to_gb(info.swap_total_kb);
|
|
||||||
let swap_used_gb = Self::kb_to_gb(swap_used_kb);
|
|
||||||
|
|
||||||
// Memory usage percentage (primary metric with status)
|
Ok(())
|
||||||
metrics.push(
|
|
||||||
Metric::new(
|
|
||||||
registry::MEMORY_USAGE_PERCENT.to_string(),
|
|
||||||
MetricValue::Float(usage_percent),
|
|
||||||
usage_status,
|
|
||||||
)
|
|
||||||
.with_description("Memory usage percentage".to_string())
|
|
||||||
.with_unit("%".to_string()),
|
|
||||||
);
|
|
||||||
|
|
||||||
// Total memory
|
|
||||||
metrics.push(
|
|
||||||
Metric::new(
|
|
||||||
registry::MEMORY_TOTAL_GB.to_string(),
|
|
||||||
MetricValue::Float(total_gb),
|
|
||||||
Status::Ok, // Total memory doesn't have status
|
|
||||||
)
|
|
||||||
.with_description("Total system memory".to_string())
|
|
||||||
.with_unit("GB".to_string()),
|
|
||||||
);
|
|
||||||
|
|
||||||
// Used memory
|
|
||||||
metrics.push(
|
|
||||||
Metric::new(
|
|
||||||
registry::MEMORY_USED_GB.to_string(),
|
|
||||||
MetricValue::Float(used_gb),
|
|
||||||
Status::Ok, // Used memory absolute value doesn't have status
|
|
||||||
)
|
|
||||||
.with_description("Used system memory".to_string())
|
|
||||||
.with_unit("GB".to_string()),
|
|
||||||
);
|
|
||||||
|
|
||||||
// Available memory
|
|
||||||
metrics.push(
|
|
||||||
Metric::new(
|
|
||||||
registry::MEMORY_AVAILABLE_GB.to_string(),
|
|
||||||
MetricValue::Float(available_gb),
|
|
||||||
Status::Ok, // Available memory absolute value doesn't have status
|
|
||||||
)
|
|
||||||
.with_description("Available system memory".to_string())
|
|
||||||
.with_unit("GB".to_string()),
|
|
||||||
);
|
|
||||||
|
|
||||||
// Swap metrics (only if swap exists)
|
|
||||||
if info.swap_total_kb > 0 {
|
|
||||||
metrics.push(
|
|
||||||
Metric::new(
|
|
||||||
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
|
|
||||||
MetricValue::Float(swap_total_gb),
|
|
||||||
Status::Ok,
|
|
||||||
)
|
|
||||||
.with_description("Total swap space".to_string())
|
|
||||||
.with_unit("GB".to_string()),
|
|
||||||
);
|
|
||||||
|
|
||||||
metrics.push(
|
|
||||||
Metric::new(
|
|
||||||
registry::MEMORY_SWAP_USED_GB.to_string(),
|
|
||||||
MetricValue::Float(swap_used_gb),
|
|
||||||
Status::Ok,
|
|
||||||
)
|
|
||||||
.with_description("Used swap space".to_string())
|
|
||||||
.with_unit("GB".to_string()),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Monitor tmpfs (/tmp) usage
|
|
||||||
if let Ok(tmpfs_metrics) = self.get_tmpfs_metrics(status_tracker) {
|
|
||||||
metrics.extend(tmpfs_metrics);
|
|
||||||
}
|
|
||||||
|
|
||||||
metrics
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get tmpfs (/tmp) usage metrics
|
/// Populate tmpfs data into AgentData
|
||||||
fn get_tmpfs_metrics(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
async fn populate_tmpfs_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
use std::process::Command;
|
// Discover all tmpfs mount points
|
||||||
|
let tmpfs_mounts = self.discover_tmpfs_mounts()?;
|
||||||
|
|
||||||
let output = Command::new("df")
|
if tmpfs_mounts.is_empty() {
|
||||||
.arg("--block-size=1")
|
debug!("No tmpfs mounts found to monitor");
|
||||||
.arg("/tmp")
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get usage data for all tmpfs mounts at once using df
|
||||||
|
let mut df_args = vec!["df", "--output=target,size,used", "--block-size=1"];
|
||||||
|
df_args.extend(tmpfs_mounts.iter().map(|s| s.as_str()));
|
||||||
|
|
||||||
|
let df_output = std::process::Command::new(df_args[0])
|
||||||
|
.args(&df_args[1..])
|
||||||
.output()
|
.output()
|
||||||
.map_err(|e| CollectorError::SystemRead {
|
.map_err(|e| CollectorError::SystemRead {
|
||||||
path: "/tmp".to_string(),
|
path: "tmpfs mounts".to_string(),
|
||||||
error: e.to_string(),
|
error: e.to_string(),
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
if !output.status.success() {
|
let df_str = String::from_utf8_lossy(&df_output.stdout);
|
||||||
return Ok(Vec::new()); // Return empty if /tmp not available
|
let df_lines: Vec<&str> = df_str.lines().skip(1).collect(); // Skip header
|
||||||
|
|
||||||
|
// Process each tmpfs mount
|
||||||
|
for (i, mount_point) in tmpfs_mounts.iter().enumerate() {
|
||||||
|
if i >= df_lines.len() {
|
||||||
|
debug!("Not enough df output lines for tmpfs mount: {}", mount_point);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let parts: Vec<&str> = df_lines[i].split_whitespace().collect();
|
||||||
|
if parts.len() < 3 {
|
||||||
|
debug!("Invalid df output for tmpfs mount: {}", mount_point);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let total_bytes: u64 = parts[1].parse().unwrap_or(0);
|
||||||
|
let used_bytes: u64 = parts[2].parse().unwrap_or(0);
|
||||||
|
|
||||||
|
if total_bytes == 0 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let total_gb = total_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
|
||||||
|
let used_gb = used_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
|
||||||
|
let usage_percent = (used_bytes as f32 / total_bytes as f32) * 100.0;
|
||||||
|
|
||||||
|
// Add to tmpfs list
|
||||||
|
agent_data.system.memory.tmpfs.push(TmpfsData {
|
||||||
|
mount: mount_point.clone(),
|
||||||
|
usage_percent,
|
||||||
|
used_gb,
|
||||||
|
total_gb,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let output_str = String::from_utf8(output.stdout)
|
// Sort tmpfs mounts by mount point for consistent display order
|
||||||
.map_err(|e| CollectorError::Parse {
|
agent_data.system.memory.tmpfs.sort_by(|a, b| a.mount.cmp(&b.mount));
|
||||||
value: "df output".to_string(),
|
|
||||||
error: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let lines: Vec<&str> = output_str.lines().collect();
|
Ok(())
|
||||||
if lines.len() < 2 {
|
}
|
||||||
return Ok(Vec::new());
|
|
||||||
|
/// Discover all tmpfs mount points from /proc/mounts
|
||||||
|
fn discover_tmpfs_mounts(&self) -> Result<Vec<String>, CollectorError> {
|
||||||
|
let content = utils::read_proc_file("/proc/mounts")?;
|
||||||
|
let mut tmpfs_mounts = Vec::new();
|
||||||
|
|
||||||
|
for line in content.lines() {
|
||||||
|
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||||
|
if fields.len() >= 3 && fields[2] == "tmpfs" {
|
||||||
|
let mount_point = fields[1];
|
||||||
|
|
||||||
|
// Filter out system/internal tmpfs mounts that aren't useful for monitoring
|
||||||
|
if self.should_monitor_tmpfs(mount_point) {
|
||||||
|
tmpfs_mounts.push(mount_point.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let fields: Vec<&str> = lines[1].split_whitespace().collect();
|
debug!("Discovered {} tmpfs mounts: {:?}", tmpfs_mounts.len(), tmpfs_mounts);
|
||||||
if fields.len() < 4 {
|
Ok(tmpfs_mounts)
|
||||||
return Ok(Vec::new());
|
}
|
||||||
}
|
|
||||||
|
|
||||||
let total_bytes: u64 = fields[1].parse()
|
/// Determine if a tmpfs mount point should be monitored
|
||||||
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
fn should_monitor_tmpfs(&self, mount_point: &str) -> bool {
|
||||||
value: fields[1].to_string(),
|
// Include commonly useful tmpfs mounts
|
||||||
error: e.to_string(),
|
matches!(mount_point,
|
||||||
})?;
|
"/tmp" | "/var/tmp" | "/dev/shm" | "/run" | "/var/log"
|
||||||
let used_bytes: u64 = fields[2].parse()
|
) || mount_point.starts_with("/run/user/") // User session tmpfs
|
||||||
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
}
|
||||||
value: fields[2].to_string(),
|
|
||||||
error: e.to_string(),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let total_gb = total_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
|
/// Calculate memory usage status based on thresholds
|
||||||
let used_gb = used_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
|
fn calculate_memory_status(&self, usage_percent: f32) -> Status {
|
||||||
let usage_percent = if total_bytes > 0 {
|
self.usage_thresholds.evaluate(usage_percent)
|
||||||
(used_bytes as f32 / total_bytes as f32) * 100.0
|
|
||||||
} else {
|
|
||||||
0.0
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut metrics = Vec::new();
|
|
||||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
|
||||||
|
|
||||||
// Calculate status using same thresholds as main memory
|
|
||||||
let tmp_status = self.calculate_usage_status("memory_tmp_usage_percent", usage_percent, status_tracker);
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "memory_tmp_usage_percent".to_string(),
|
|
||||||
value: MetricValue::Float(usage_percent),
|
|
||||||
unit: Some("%".to_string()),
|
|
||||||
description: Some("tmpfs /tmp usage percentage".to_string()),
|
|
||||||
status: tmp_status,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "memory_tmp_used_gb".to_string(),
|
|
||||||
value: MetricValue::Float(used_gb),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
description: Some("tmpfs /tmp used space".to_string()),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "memory_tmp_total_gb".to_string(),
|
|
||||||
value: MetricValue::Float(total_gb),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
description: Some("tmpfs /tmp total space".to_string()),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
|
|
||||||
Ok(metrics)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl Collector for MemoryCollector {
|
impl Collector for MemoryCollector {
|
||||||
|
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
|
||||||
debug!("Collecting memory metrics");
|
debug!("Collecting memory metrics");
|
||||||
let start = std::time::Instant::now();
|
let start = std::time::Instant::now();
|
||||||
|
|
||||||
// Parse memory info from /proc/meminfo
|
// Parse memory info from /proc/meminfo
|
||||||
let info = self.parse_meminfo().await?;
|
let info = self.parse_meminfo().await?;
|
||||||
|
|
||||||
// Calculate all metrics from parsed info
|
// Populate memory data directly
|
||||||
let metrics = self.calculate_metrics(&info, status_tracker);
|
self.populate_memory_data(&info, agent_data).await?;
|
||||||
|
|
||||||
|
// Collect tmpfs data
|
||||||
|
self.populate_tmpfs_data(agent_data).await?;
|
||||||
|
|
||||||
let duration = start.elapsed();
|
let duration = start.elapsed();
|
||||||
debug!(
|
debug!("Memory collection completed in {:?}", duration);
|
||||||
"Memory collection completed in {:?} with {} metrics",
|
|
||||||
duration,
|
|
||||||
metrics.len()
|
|
||||||
);
|
|
||||||
|
|
||||||
// Efficiency check: warn if collection takes too long
|
// Efficiency check: warn if collection takes too long
|
||||||
if duration.as_millis() > 1 {
|
if duration.as_millis() > 1 {
|
||||||
@@ -311,10 +220,21 @@ impl Collector for MemoryCollector {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Store performance metrics
|
// Calculate status using thresholds
|
||||||
// Performance tracking handled by cache system
|
agent_data.system.memory.usage_status = self.calculate_memory_status(agent_data.system.memory.usage_percent);
|
||||||
|
|
||||||
Ok(metrics)
|
Ok(())
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Internal structure for parsing /proc/meminfo
|
||||||
|
#[derive(Default)]
|
||||||
|
struct MemoryInfo {
|
||||||
|
total_kb: u64,
|
||||||
|
available_kb: u64,
|
||||||
|
free_kb: u64,
|
||||||
|
buffers_kb: u64,
|
||||||
|
cached_kb: u64,
|
||||||
|
swap_total_kb: u64,
|
||||||
|
swap_free_kb: u64,
|
||||||
}
|
}
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use cm_dashboard_shared::{Metric, StatusTracker};
|
use cm_dashboard_shared::{AgentData};
|
||||||
|
|
||||||
|
|
||||||
pub mod backup;
|
pub mod backup;
|
||||||
@@ -7,19 +7,18 @@ pub mod cpu;
|
|||||||
pub mod disk;
|
pub mod disk;
|
||||||
pub mod error;
|
pub mod error;
|
||||||
pub mod memory;
|
pub mod memory;
|
||||||
|
pub mod network;
|
||||||
pub mod nixos;
|
pub mod nixos;
|
||||||
pub mod systemd;
|
pub mod systemd;
|
||||||
|
|
||||||
pub use error::CollectorError;
|
pub use error::CollectorError;
|
||||||
|
|
||||||
|
|
||||||
/// Base trait for all collectors with extreme efficiency requirements
|
/// Base trait for all collectors with direct structured data output
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
pub trait Collector: Send + Sync {
|
pub trait Collector: Send + Sync {
|
||||||
/// Collect all metrics this collector provides
|
/// Collect data and populate AgentData directly with status evaluation
|
||||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError>;
|
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError>;
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// CPU efficiency rules for all collectors
|
/// CPU efficiency rules for all collectors
|
||||||
|
|||||||
224
agent/src/collectors/network.rs
Normal file
224
agent/src/collectors/network.rs
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
use async_trait::async_trait;
|
||||||
|
use cm_dashboard_shared::{AgentData, NetworkInterfaceData, Status};
|
||||||
|
use std::process::Command;
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use super::{Collector, CollectorError};
|
||||||
|
use crate::config::NetworkConfig;
|
||||||
|
|
||||||
|
/// Network interface collector with physical/virtual classification and link status
|
||||||
|
pub struct NetworkCollector {
|
||||||
|
_config: NetworkConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl NetworkCollector {
|
||||||
|
pub fn new(config: NetworkConfig) -> Self {
|
||||||
|
Self { _config: config }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Report whether `name` looks like a physical (non-virtual) network interface.
///
/// The decision is purely name based: an interface is treated as physical
/// when its name starts with one of the known wired/wireless prefixes
/// (`eth`, `ens`, `enp`, `wlan`, `wlp`, `eno`, `enx`). Everything else,
/// including the empty string, is considered virtual.
fn is_physical_interface(name: &str) -> bool {
    /// Name prefixes used for physical NICs.
    const PHYSICAL_PREFIXES: [&str; 7] = ["eth", "ens", "enp", "wlan", "wlp", "eno", "enx"];

    PHYSICAL_PREFIXES
        .iter()
        .any(|prefix| name.starts_with(prefix))
}
|
||||||
|
|
||||||
|
/// Get link status for an interface
|
||||||
|
fn get_link_status(interface: &str) -> Status {
|
||||||
|
let operstate_path = format!("/sys/class/net/{}/operstate", interface);
|
||||||
|
|
||||||
|
match std::fs::read_to_string(&operstate_path) {
|
||||||
|
Ok(state) => {
|
||||||
|
let state = state.trim();
|
||||||
|
match state {
|
||||||
|
"up" => Status::Ok,
|
||||||
|
"down" => Status::Inactive,
|
||||||
|
"unknown" => Status::Warning,
|
||||||
|
_ => Status::Unknown,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(_) => Status::Unknown,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the primary physical interface (the one with default route)
|
||||||
|
fn get_primary_physical_interface() -> Option<String> {
|
||||||
|
match Command::new("ip").args(["route", "show", "default"]).output() {
|
||||||
|
Ok(output) if output.status.success() => {
|
||||||
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||||
|
// Parse: "default via 192.168.1.1 dev eno1 ..."
|
||||||
|
for line in output_str.lines() {
|
||||||
|
if line.starts_with("default") {
|
||||||
|
if let Some(dev_pos) = line.find(" dev ") {
|
||||||
|
let after_dev = &line[dev_pos + 5..];
|
||||||
|
if let Some(space_pos) = after_dev.find(' ') {
|
||||||
|
let interface = &after_dev[..space_pos];
|
||||||
|
// Only return if it's a physical interface
|
||||||
|
if Self::is_physical_interface(interface) {
|
||||||
|
return Some(interface.to_string());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// No space after interface name (end of line)
|
||||||
|
let interface = after_dev.trim();
|
||||||
|
if Self::is_physical_interface(interface) {
|
||||||
|
return Some(interface.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read the kernel VLAN table from `/proc/net/vlan/config`.
///
/// The first two lines of the file are headers and are skipped. Every
/// remaining line is split on `|`; the first column is the interface name
/// and the second the VLAN ID. Lines with fewer than two columns or a VLAN ID
/// that does not parse as `u16` are ignored.
///
/// Returns a map of interface name -> VLAN ID, which is empty when the file
/// cannot be read.
fn parse_vlan_config() -> std::collections::HashMap<String, u16> {
    let contents = std::fs::read_to_string("/proc/net/vlan/config").unwrap_or_default();

    let vlan_map = contents
        .lines()
        .skip(2) // Skip header lines
        .filter_map(|line| {
            let mut columns = line.split('|');
            let interface_name = columns.next()?.trim();
            let vlan_id = columns.next()?.trim().parse::<u16>().ok()?;
            Some((interface_name.to_string(), vlan_id))
        })
        .collect();

    vlan_map
}
|
||||||
|
|
||||||
|
/// Collect network interfaces using ip command
|
||||||
|
async fn collect_interfaces(&self) -> Vec<NetworkInterfaceData> {
|
||||||
|
let mut interfaces = Vec::new();
|
||||||
|
|
||||||
|
// Parse VLAN configuration
|
||||||
|
let vlan_map = Self::parse_vlan_config();
|
||||||
|
|
||||||
|
match Command::new("ip").args(["-j", "addr"]).output() {
|
||||||
|
Ok(output) if output.status.success() => {
|
||||||
|
let json_str = String::from_utf8_lossy(&output.stdout);
|
||||||
|
|
||||||
|
if let Ok(json_data) = serde_json::from_str::<serde_json::Value>(&json_str) {
|
||||||
|
if let Some(ifaces) = json_data.as_array() {
|
||||||
|
for iface in ifaces {
|
||||||
|
let name = iface["ifname"].as_str().unwrap_or("").to_string();
|
||||||
|
|
||||||
|
// Skip loopback, empty names, and ifb* interfaces
|
||||||
|
if name.is_empty() || name == "lo" || name.starts_with("ifb") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse parent interface from @parent notation (e.g., lan@enp0s31f6)
|
||||||
|
let (interface_name, parent_interface) = if let Some(at_pos) = name.find('@') {
|
||||||
|
let (child, parent) = name.split_at(at_pos);
|
||||||
|
(child.to_string(), Some(parent[1..].to_string()))
|
||||||
|
} else {
|
||||||
|
(name.clone(), None)
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut ipv4_addresses = Vec::new();
|
||||||
|
let mut ipv6_addresses = Vec::new();
|
||||||
|
|
||||||
|
// Extract IP addresses
|
||||||
|
if let Some(addr_info) = iface["addr_info"].as_array() {
|
||||||
|
for addr in addr_info {
|
||||||
|
if let Some(family) = addr["family"].as_str() {
|
||||||
|
if let Some(local) = addr["local"].as_str() {
|
||||||
|
match family {
|
||||||
|
"inet" => ipv4_addresses.push(local.to_string()),
|
||||||
|
"inet6" => {
|
||||||
|
// Skip link-local IPv6 addresses (fe80::)
|
||||||
|
if !local.starts_with("fe80:") {
|
||||||
|
ipv6_addresses.push(local.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine if physical and get status
|
||||||
|
let is_physical = Self::is_physical_interface(&interface_name);
|
||||||
|
|
||||||
|
// Only filter out virtual interfaces without IPs
|
||||||
|
// Physical interfaces should always be shown even if down/no IPs
|
||||||
|
if !is_physical && ipv4_addresses.is_empty() && ipv6_addresses.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let link_status = if is_physical {
|
||||||
|
Self::get_link_status(&name)
|
||||||
|
} else {
|
||||||
|
Status::Unknown // Virtual interfaces don't have meaningful link status
|
||||||
|
};
|
||||||
|
|
||||||
|
// Look up VLAN ID from the map (use original name before @ parsing)
|
||||||
|
let vlan_id = vlan_map.get(&name).copied();
|
||||||
|
|
||||||
|
interfaces.push(NetworkInterfaceData {
|
||||||
|
name: interface_name,
|
||||||
|
ipv4_addresses,
|
||||||
|
ipv6_addresses,
|
||||||
|
is_physical,
|
||||||
|
link_status,
|
||||||
|
parent_interface,
|
||||||
|
vlan_id,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
debug!("Failed to execute ip command: {}", e);
|
||||||
|
}
|
||||||
|
Ok(output) => {
|
||||||
|
debug!("ip command failed with status: {}", output.status);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assign primary physical interface as parent to virtual interfaces without explicit parent
|
||||||
|
let primary_interface = Self::get_primary_physical_interface();
|
||||||
|
if let Some(primary) = primary_interface {
|
||||||
|
for interface in interfaces.iter_mut() {
|
||||||
|
// Only assign parent to virtual interfaces that don't already have one
|
||||||
|
if !interface.is_physical && interface.parent_interface.is_none() {
|
||||||
|
interface.parent_interface = Some(primary.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
interfaces
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Collector for NetworkCollector {
|
||||||
|
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
|
debug!("Collecting network interface data");
|
||||||
|
|
||||||
|
// Collect all network interfaces
|
||||||
|
let interfaces = self.collect_interfaces().await;
|
||||||
|
|
||||||
|
agent_data.system.network.interfaces = interfaces;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,172 +1,111 @@
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
use cm_dashboard_shared::AgentData;
|
||||||
|
use std::fs;
|
||||||
use std::process::Command;
|
use std::process::Command;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use super::{Collector, CollectorError};
|
use super::{Collector, CollectorError};
|
||||||
use crate::config::NixOSConfig;
|
|
||||||
|
|
||||||
/// NixOS system information collector
|
/// NixOS system information collector with structured data output
|
||||||
///
|
///
|
||||||
/// Collects NixOS-specific system information including:
|
/// This collector gathers NixOS-specific information like:
|
||||||
/// - NixOS version and build information
|
/// - System generation/build information
|
||||||
pub struct NixOSCollector {
|
/// - Version information
|
||||||
}
|
/// - Agent version from Nix store path
|
||||||
|
pub struct NixOSCollector;
|
||||||
|
|
||||||
impl NixOSCollector {
|
impl NixOSCollector {
|
||||||
pub fn new(_config: NixOSConfig) -> Self {
|
pub fn new(_config: crate::config::NixOSConfig) -> Self {
|
||||||
Self {}
|
Self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Collect NixOS system information and populate AgentData
|
||||||
|
async fn collect_nixos_info(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
|
debug!("Collecting NixOS system information");
|
||||||
|
|
||||||
/// Get agent hash from binary path
|
// Set hostname (this is universal, not NixOS-specific)
|
||||||
fn get_agent_hash(&self) -> Result<String, Box<dyn std::error::Error>> {
|
agent_data.hostname = self.get_hostname().await.unwrap_or_else(|| "unknown".to_string());
|
||||||
// Get the path of the current executable
|
|
||||||
let exe_path = std::env::current_exe()?;
|
|
||||||
let exe_str = exe_path.to_string_lossy();
|
|
||||||
|
|
||||||
// Extract Nix store hash from path like /nix/store/fn804fh332mp8gz06qawminpj20xl25h-cm-dashboard-0.1.0/bin/cm-dashboard-agent
|
// Set agent version from environment or Nix store path
|
||||||
if let Some(store_path) = exe_str.strip_prefix("/nix/store/") {
|
agent_data.agent_version = self.get_agent_version().await;
|
||||||
if let Some(dash_pos) = store_path.find('-') {
|
|
||||||
return Ok(store_path[..dash_pos].to_string());
|
// Set NixOS build/generation information
|
||||||
|
agent_data.build_version = self.get_nixos_generation().await;
|
||||||
|
|
||||||
|
// Set current timestamp
|
||||||
|
agent_data.timestamp = chrono::Utc::now().timestamp() as u64;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get system hostname
|
||||||
|
async fn get_hostname(&self) -> Option<String> {
|
||||||
|
match fs::read_to_string("/etc/hostname") {
|
||||||
|
Ok(hostname) => Some(hostname.trim().to_string()),
|
||||||
|
Err(_) => {
|
||||||
|
// Fallback to hostname command
|
||||||
|
match Command::new("hostname").output() {
|
||||||
|
Ok(output) => Some(String::from_utf8_lossy(&output.stdout).trim().to_string()),
|
||||||
|
Err(_) => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get agent version from Nix store path or environment
|
||||||
|
async fn get_agent_version(&self) -> String {
|
||||||
|
// Try to extract version from the current executable path (Nix store)
|
||||||
|
if let Ok(current_exe) = std::env::current_exe() {
|
||||||
|
if let Some(exe_path) = current_exe.to_str() {
|
||||||
|
if exe_path.starts_with("/nix/store/") {
|
||||||
|
// Extract version from Nix store path
|
||||||
|
// Path format: /nix/store/hash-cm-dashboard-agent-v0.1.138/bin/cm-dashboard-agent
|
||||||
|
if let Some(store_part) = exe_path.strip_prefix("/nix/store/") {
|
||||||
|
if let Some(dash_pos) = store_part.find('-') {
|
||||||
|
let package_part = &store_part[dash_pos + 1..];
|
||||||
|
if let Some(bin_pos) = package_part.find("/bin/") {
|
||||||
|
let package_name = &package_part[..bin_pos];
|
||||||
|
// Extract version from package name
|
||||||
|
if let Some(version_start) = package_name.rfind("-v") {
|
||||||
|
return package_name[version_start + 1..].to_string();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fallback to "unknown" if not in Nix store
|
// Fallback to environment variable or default
|
||||||
Ok("unknown".to_string())
|
std::env::var("CM_DASHBOARD_VERSION").unwrap_or_else(|_| "unknown".to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get configuration hash from deployed nix store system
|
/// Get NixOS system generation (build) information from git commit
|
||||||
/// Get git commit hash from rebuild process
|
async fn get_nixos_generation(&self) -> Option<String> {
|
||||||
fn get_git_commit(&self) -> Result<String, Box<dyn std::error::Error>> {
|
// Try to read git commit hash from file written during rebuild
|
||||||
let commit_file = "/var/lib/cm-dashboard/git-commit";
|
let commit_file = "/var/lib/cm-dashboard/git-commit";
|
||||||
match std::fs::read_to_string(commit_file) {
|
match fs::read_to_string(commit_file) {
|
||||||
Ok(content) => {
|
Ok(content) => {
|
||||||
let commit_hash = content.trim();
|
let commit_hash = content.trim();
|
||||||
if commit_hash.len() >= 7 {
|
if commit_hash.len() >= 7 {
|
||||||
Ok(commit_hash.to_string())
|
debug!("Found git commit hash: {}", commit_hash);
|
||||||
|
Some(commit_hash.to_string())
|
||||||
} else {
|
} else {
|
||||||
Err("Git commit hash too short".into())
|
debug!("Git commit hash too short: {}", commit_hash);
|
||||||
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => Err(format!("Failed to read git commit file: {}", e).into())
|
Err(e) => {
|
||||||
}
|
debug!("Failed to read git commit file {}: {}", commit_file, e);
|
||||||
}
|
None
|
||||||
|
|
||||||
fn get_config_hash(&self) -> Result<String, Box<dyn std::error::Error>> {
|
|
||||||
// Read the symlink target of /run/current-system to get nix store path
|
|
||||||
let output = Command::new("readlink")
|
|
||||||
.arg("/run/current-system")
|
|
||||||
.output()?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
return Err("readlink command failed".into());
|
|
||||||
}
|
|
||||||
|
|
||||||
let binding = String::from_utf8_lossy(&output.stdout);
|
|
||||||
let store_path = binding.trim();
|
|
||||||
|
|
||||||
// Extract hash from nix store path
|
|
||||||
// Format: /nix/store/HASH-nixos-system-HOSTNAME-VERSION
|
|
||||||
if let Some(hash_part) = store_path.strip_prefix("/nix/store/") {
|
|
||||||
if let Some(hash) = hash_part.split('-').next() {
|
|
||||||
if hash.len() >= 8 {
|
|
||||||
// Return first 8 characters of nix store hash
|
|
||||||
return Ok(hash[..8].to_string());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Err("Could not extract hash from nix store path".into())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl Collector for NixOSCollector {
|
impl Collector for NixOSCollector {
|
||||||
|
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
self.collect_nixos_info(agent_data).await
|
||||||
debug!("Collecting NixOS system information");
|
|
||||||
let mut metrics = Vec::new();
|
|
||||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
|
||||||
|
|
||||||
// Collect git commit information (shows what's actually deployed)
|
|
||||||
match self.get_git_commit() {
|
|
||||||
Ok(git_commit) => {
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "system_nixos_build".to_string(),
|
|
||||||
value: MetricValue::String(git_commit),
|
|
||||||
unit: None,
|
|
||||||
description: Some("Git commit hash of deployed configuration".to_string()),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
debug!("Failed to get git commit: {}", e);
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "system_nixos_build".to_string(),
|
|
||||||
value: MetricValue::String("unknown".to_string()),
|
|
||||||
unit: None,
|
|
||||||
description: Some("Git commit hash (failed to detect)".to_string()),
|
|
||||||
status: Status::Unknown,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Collect config hash
|
|
||||||
match self.get_config_hash() {
|
|
||||||
Ok(hash) => {
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "system_config_hash".to_string(),
|
|
||||||
value: MetricValue::String(hash),
|
|
||||||
unit: None,
|
|
||||||
description: Some("NixOS deployed configuration hash".to_string()),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
debug!("Failed to get config hash: {}", e);
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "system_config_hash".to_string(),
|
|
||||||
value: MetricValue::String("unknown".to_string()),
|
|
||||||
unit: None,
|
|
||||||
description: Some("Deployed config hash (failed to detect)".to_string()),
|
|
||||||
status: Status::Unknown,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Collect agent hash
|
|
||||||
match self.get_agent_hash() {
|
|
||||||
Ok(hash) => {
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "system_agent_hash".to_string(),
|
|
||||||
value: MetricValue::String(hash),
|
|
||||||
unit: None,
|
|
||||||
description: Some("Agent Nix store hash".to_string()),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
debug!("Failed to get agent hash: {}", e);
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: "system_agent_hash".to_string(),
|
|
||||||
value: MetricValue::String("unknown".to_string()),
|
|
||||||
unit: None,
|
|
||||||
description: Some("Agent hash (failed to detect)".to_string()),
|
|
||||||
status: Status::Unknown,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!("Collected {} NixOS metrics", metrics.len());
|
|
||||||
Ok(metrics)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
use cm_dashboard_shared::{AgentData, ServiceData, SubServiceData, SubServiceMetric, Status};
|
||||||
use std::process::Command;
|
use std::process::Command;
|
||||||
use std::sync::RwLock;
|
use std::sync::RwLock;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
@@ -9,7 +9,7 @@ use tracing::debug;
|
|||||||
use super::{Collector, CollectorError};
|
use super::{Collector, CollectorError};
|
||||||
use crate::config::SystemdConfig;
|
use crate::config::SystemdConfig;
|
||||||
|
|
||||||
/// Systemd collector for monitoring systemd services
|
/// Systemd collector for monitoring systemd services with structured data output
|
||||||
pub struct SystemdCollector {
|
pub struct SystemdCollector {
|
||||||
/// Cached state with thread-safe interior mutability
|
/// Cached state with thread-safe interior mutability
|
||||||
state: RwLock<ServiceCacheState>,
|
state: RwLock<ServiceCacheState>,
|
||||||
@@ -18,18 +18,22 @@ pub struct SystemdCollector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Internal state for service caching
|
/// Internal state for service caching
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Clone)]
|
||||||
struct ServiceCacheState {
|
struct ServiceCacheState {
|
||||||
|
/// Last collection time for performance tracking
|
||||||
|
last_collection: Option<Instant>,
|
||||||
|
/// Cached complete service data with sub-services
|
||||||
|
cached_service_data: Vec<ServiceData>,
|
||||||
/// Interesting services to monitor (cached after discovery)
|
/// Interesting services to monitor (cached after discovery)
|
||||||
monitored_services: Vec<String>,
|
monitored_services: Vec<String>,
|
||||||
/// Cached service status information from discovery
|
/// Cached service status information from discovery
|
||||||
service_status_cache: std::collections::HashMap<String, ServiceStatusInfo>,
|
service_status_cache: std::collections::HashMap<String, ServiceStatusInfo>,
|
||||||
/// Last time services were discovered
|
/// Last time services were discovered
|
||||||
last_discovery_time: Option<Instant>,
|
last_discovery_time: Option<Instant>,
|
||||||
/// How often to rediscover services (5 minutes)
|
/// How often to rediscover services (from config)
|
||||||
discovery_interval_seconds: u64,
|
discovery_interval_seconds: u64,
|
||||||
/// Cached nginx site latency metrics
|
/// Cached nginx site latency metrics
|
||||||
nginx_site_metrics: Vec<Metric>,
|
nginx_site_metrics: Vec<(String, f32)>,
|
||||||
/// Last time nginx sites were checked
|
/// Last time nginx sites were checked
|
||||||
last_nginx_check_time: Option<Instant>,
|
last_nginx_check_time: Option<Instant>,
|
||||||
/// How often to check nginx site latency (configurable)
|
/// How often to check nginx site latency (configurable)
|
||||||
@@ -46,20 +50,138 @@ struct ServiceStatusInfo {
|
|||||||
|
|
||||||
impl SystemdCollector {
|
impl SystemdCollector {
|
||||||
pub fn new(config: SystemdConfig) -> Self {
|
pub fn new(config: SystemdConfig) -> Self {
|
||||||
|
let state = ServiceCacheState {
|
||||||
|
last_collection: None,
|
||||||
|
cached_service_data: Vec::new(),
|
||||||
|
monitored_services: Vec::new(),
|
||||||
|
service_status_cache: std::collections::HashMap::new(),
|
||||||
|
last_discovery_time: None,
|
||||||
|
discovery_interval_seconds: config.interval_seconds,
|
||||||
|
nginx_site_metrics: Vec::new(),
|
||||||
|
last_nginx_check_time: None,
|
||||||
|
nginx_check_interval_seconds: config.nginx_check_interval_seconds,
|
||||||
|
};
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
state: RwLock::new(ServiceCacheState {
|
state: RwLock::new(state),
|
||||||
monitored_services: Vec::new(),
|
|
||||||
service_status_cache: std::collections::HashMap::new(),
|
|
||||||
last_discovery_time: None,
|
|
||||||
discovery_interval_seconds: config.interval_seconds,
|
|
||||||
nginx_site_metrics: Vec::new(),
|
|
||||||
last_nginx_check_time: None,
|
|
||||||
nginx_check_interval_seconds: config.nginx_check_interval_seconds,
|
|
||||||
}),
|
|
||||||
config,
|
config,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Collect service data and populate AgentData
|
||||||
|
async fn collect_service_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
|
let start_time = Instant::now();
|
||||||
|
debug!("Collecting systemd services metrics");
|
||||||
|
|
||||||
|
// Get cached services (discovery only happens when needed)
|
||||||
|
let monitored_services = match self.get_monitored_services() {
|
||||||
|
Ok(services) => services,
|
||||||
|
Err(e) => {
|
||||||
|
debug!("Failed to get monitored services: {}", e);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Collect service data for each monitored service
|
||||||
|
let mut complete_service_data = Vec::new();
|
||||||
|
for service_name in &monitored_services {
|
||||||
|
match self.get_service_status(service_name) {
|
||||||
|
Ok((active_status, _detailed_info)) => {
|
||||||
|
let memory_mb = self.get_service_memory_usage(service_name).await.unwrap_or(0.0);
|
||||||
|
let disk_gb = self.get_service_disk_usage(service_name).await.unwrap_or(0.0);
|
||||||
|
|
||||||
|
let mut sub_services = Vec::new();
|
||||||
|
|
||||||
|
// Sub-service metrics for specific services (always include cached results)
|
||||||
|
if service_name.contains("nginx") && active_status == "active" {
|
||||||
|
let nginx_sites = self.get_nginx_site_metrics();
|
||||||
|
for (site_name, latency_ms) in nginx_sites {
|
||||||
|
let site_status = if latency_ms >= 0.0 && latency_ms < self.config.nginx_latency_critical_ms {
|
||||||
|
"active"
|
||||||
|
} else {
|
||||||
|
"failed"
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut metrics = Vec::new();
|
||||||
|
metrics.push(SubServiceMetric {
|
||||||
|
label: "latency_ms".to_string(),
|
||||||
|
value: latency_ms,
|
||||||
|
unit: Some("ms".to_string()),
|
||||||
|
});
|
||||||
|
|
||||||
|
sub_services.push(SubServiceData {
|
||||||
|
name: site_name.clone(),
|
||||||
|
service_status: self.calculate_service_status(&site_name, &site_status),
|
||||||
|
metrics,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if service_name.contains("docker") && active_status == "active" {
|
||||||
|
let docker_containers = self.get_docker_containers();
|
||||||
|
for (container_name, container_status) in docker_containers {
|
||||||
|
// For now, docker containers have no additional metrics
|
||||||
|
// Future: could add memory_mb, cpu_percent, restart_count, etc.
|
||||||
|
let metrics = Vec::new();
|
||||||
|
|
||||||
|
sub_services.push(SubServiceData {
|
||||||
|
name: container_name.clone(),
|
||||||
|
service_status: self.calculate_service_status(&container_name, &container_status),
|
||||||
|
metrics,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add Docker images
|
||||||
|
let docker_images = self.get_docker_images();
|
||||||
|
for (image_name, image_status, image_size) in docker_images {
|
||||||
|
let mut metrics = Vec::new();
|
||||||
|
metrics.push(SubServiceMetric {
|
||||||
|
label: "size".to_string(),
|
||||||
|
value: 0.0, // Size as string in name instead
|
||||||
|
unit: None,
|
||||||
|
});
|
||||||
|
|
||||||
|
sub_services.push(SubServiceData {
|
||||||
|
name: format!("{} ({})", image_name, image_size),
|
||||||
|
service_status: self.calculate_service_status(&image_name, &image_status),
|
||||||
|
metrics,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create complete service data
|
||||||
|
let service_data = ServiceData {
|
||||||
|
name: service_name.clone(),
|
||||||
|
memory_mb,
|
||||||
|
disk_gb,
|
||||||
|
user_stopped: false, // TODO: Integrate with service tracker
|
||||||
|
service_status: self.calculate_service_status(service_name, &active_status),
|
||||||
|
sub_services,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Add to AgentData and cache
|
||||||
|
agent_data.services.push(service_data.clone());
|
||||||
|
complete_service_data.push(service_data);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
debug!("Failed to get status for service {}: {}", service_name, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update cached state
|
||||||
|
{
|
||||||
|
let mut state = self.state.write().unwrap();
|
||||||
|
state.last_collection = Some(start_time);
|
||||||
|
state.cached_service_data = complete_service_data;
|
||||||
|
}
|
||||||
|
|
||||||
|
let elapsed = start_time.elapsed();
|
||||||
|
debug!("Systemd collection completed in {:?} with {} services", elapsed, agent_data.services.len());
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// Get monitored services, discovering them if needed or cache is expired
|
/// Get monitored services, discovering them if needed or cache is expired
|
||||||
fn get_monitored_services(&self) -> Result<Vec<String>> {
|
fn get_monitored_services(&self) -> Result<Vec<String>> {
|
||||||
// Check if we need discovery without holding the lock
|
// Check if we need discovery without holding the lock
|
||||||
@@ -76,25 +198,19 @@ impl SystemdCollector {
|
|||||||
|
|
||||||
if needs_discovery {
|
if needs_discovery {
|
||||||
debug!("Discovering systemd services (cache expired or first run)");
|
debug!("Discovering systemd services (cache expired or first run)");
|
||||||
// Call discover_services_internal which doesn't update state
|
|
||||||
match self.discover_services_internal() {
|
match self.discover_services_internal() {
|
||||||
Ok((services, status_cache)) => {
|
Ok((services, status_cache)) => {
|
||||||
// Update state with discovered services in a separate scope
|
|
||||||
if let Ok(mut state) = self.state.write() {
|
if let Ok(mut state) = self.state.write() {
|
||||||
state.monitored_services = services.clone();
|
state.monitored_services = services.clone();
|
||||||
state.service_status_cache = status_cache;
|
state.service_status_cache = status_cache;
|
||||||
state.last_discovery_time = Some(Instant::now());
|
state.last_discovery_time = Some(Instant::now());
|
||||||
debug!(
|
debug!("Auto-discovered {} services to monitor: {:?}",
|
||||||
"Auto-discovered {} services to monitor: {:?}",
|
state.monitored_services.len(), state.monitored_services);
|
||||||
state.monitored_services.len(),
|
|
||||||
state.monitored_services
|
|
||||||
);
|
|
||||||
return Ok(services);
|
return Ok(services);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
debug!("Failed to discover services, using cached list: {}", e);
|
debug!("Failed to discover services, using cached list: {}", e);
|
||||||
// Continue with existing cached services if discovery fails
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -104,8 +220,8 @@ impl SystemdCollector {
|
|||||||
Ok(state.monitored_services.clone())
|
Ok(state.monitored_services.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get nginx site metrics, checking them if cache is expired
|
/// Get nginx site metrics, checking them if cache is expired (like old working version)
|
||||||
fn get_nginx_site_metrics(&self) -> Vec<Metric> {
|
fn get_nginx_site_metrics(&self) -> Vec<(String, f32)> {
|
||||||
let mut state = self.state.write().unwrap();
|
let mut state = self.state.write().unwrap();
|
||||||
|
|
||||||
// Check if we need to refresh nginx site metrics
|
// Check if we need to refresh nginx site metrics
|
||||||
@@ -120,11 +236,7 @@ impl SystemdCollector {
|
|||||||
if needs_refresh {
|
if needs_refresh {
|
||||||
// Only check nginx sites if nginx service is active
|
// Only check nginx sites if nginx service is active
|
||||||
if state.monitored_services.iter().any(|s| s.contains("nginx")) {
|
if state.monitored_services.iter().any(|s| s.contains("nginx")) {
|
||||||
debug!(
|
let fresh_metrics = self.get_nginx_sites_internal();
|
||||||
"Refreshing nginx site latency metrics (interval: {}s)",
|
|
||||||
state.nginx_check_interval_seconds
|
|
||||||
);
|
|
||||||
let fresh_metrics = self.get_nginx_sites();
|
|
||||||
state.nginx_site_metrics = fresh_metrics;
|
state.nginx_site_metrics = fresh_metrics;
|
||||||
state.last_nginx_check_time = Some(Instant::now());
|
state.last_nginx_check_time = Some(Instant::now());
|
||||||
}
|
}
|
||||||
@@ -133,16 +245,11 @@ impl SystemdCollector {
|
|||||||
state.nginx_site_metrics.clone()
|
state.nginx_site_metrics.clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Auto-discover interesting services to monitor (internal version that doesn't update state)
|
/// Auto-discover interesting services to monitor
|
||||||
fn discover_services_internal(&self) -> Result<(Vec<String>, std::collections::HashMap<String, ServiceStatusInfo>)> {
|
fn discover_services_internal(&self) -> Result<(Vec<String>, std::collections::HashMap<String, ServiceStatusInfo>)> {
|
||||||
debug!("Starting systemd service discovery with status caching");
|
// First: Get all service unit files
|
||||||
|
|
||||||
// First: Get all service unit files (includes services that have never been started)
|
|
||||||
let unit_files_output = Command::new("systemctl")
|
let unit_files_output = Command::new("systemctl")
|
||||||
.arg("list-unit-files")
|
.args(&["list-unit-files", "--type=service", "--no-pager", "--plain"])
|
||||||
.arg("--type=service")
|
|
||||||
.arg("--no-pager")
|
|
||||||
.arg("--plain")
|
|
||||||
.output()?;
|
.output()?;
|
||||||
|
|
||||||
if !unit_files_output.status.success() {
|
if !unit_files_output.status.success() {
|
||||||
@@ -151,11 +258,7 @@ impl SystemdCollector {
|
|||||||
|
|
||||||
// Second: Get runtime status of all units
|
// Second: Get runtime status of all units
|
||||||
let units_status_output = Command::new("systemctl")
|
let units_status_output = Command::new("systemctl")
|
||||||
.arg("list-units")
|
.args(&["list-units", "--type=service", "--all", "--no-pager", "--plain"])
|
||||||
.arg("--type=service")
|
|
||||||
.arg("--all")
|
|
||||||
.arg("--no-pager")
|
|
||||||
.arg("--plain")
|
|
||||||
.output()?;
|
.output()?;
|
||||||
|
|
||||||
if !units_status_output.status.success() {
|
if !units_status_output.status.success() {
|
||||||
@@ -166,19 +269,16 @@ impl SystemdCollector {
|
|||||||
let units_status_str = String::from_utf8(units_status_output.stdout)?;
|
let units_status_str = String::from_utf8(units_status_output.stdout)?;
|
||||||
let mut services = Vec::new();
|
let mut services = Vec::new();
|
||||||
|
|
||||||
// Use configuration instead of hardcoded values
|
|
||||||
let excluded_services = &self.config.excluded_services;
|
let excluded_services = &self.config.excluded_services;
|
||||||
let service_name_filters = &self.config.service_name_filters;
|
let service_name_filters = &self.config.service_name_filters;
|
||||||
|
|
||||||
// Parse all service unit files to get complete service list
|
// Parse all service unit files
|
||||||
let mut all_service_names = std::collections::HashSet::new();
|
let mut all_service_names = std::collections::HashSet::new();
|
||||||
|
|
||||||
for line in unit_files_str.lines() {
|
for line in unit_files_str.lines() {
|
||||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||||
if fields.len() >= 2 && fields[0].ends_with(".service") {
|
if fields.len() >= 2 && fields[0].ends_with(".service") {
|
||||||
let service_name = fields[0].trim_end_matches(".service");
|
let service_name = fields[0].trim_end_matches(".service");
|
||||||
all_service_names.insert(service_name.to_string());
|
all_service_names.insert(service_name.to_string());
|
||||||
debug!("Found service unit file: {}", service_name);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -188,20 +288,15 @@ impl SystemdCollector {
|
|||||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||||
if fields.len() >= 4 && fields[0].ends_with(".service") {
|
if fields.len() >= 4 && fields[0].ends_with(".service") {
|
||||||
let service_name = fields[0].trim_end_matches(".service");
|
let service_name = fields[0].trim_end_matches(".service");
|
||||||
|
|
||||||
// Extract status information from systemctl list-units output
|
|
||||||
let load_state = fields.get(1).unwrap_or(&"unknown").to_string();
|
let load_state = fields.get(1).unwrap_or(&"unknown").to_string();
|
||||||
let active_state = fields.get(2).unwrap_or(&"unknown").to_string();
|
let active_state = fields.get(2).unwrap_or(&"unknown").to_string();
|
||||||
let sub_state = fields.get(3).unwrap_or(&"unknown").to_string();
|
let sub_state = fields.get(3).unwrap_or(&"unknown").to_string();
|
||||||
|
|
||||||
// Cache the status information
|
|
||||||
status_cache.insert(service_name.to_string(), ServiceStatusInfo {
|
status_cache.insert(service_name.to_string(), ServiceStatusInfo {
|
||||||
load_state: load_state.clone(),
|
load_state,
|
||||||
active_state: active_state.clone(),
|
active_state,
|
||||||
sub_state: sub_state.clone(),
|
sub_state,
|
||||||
});
|
});
|
||||||
|
|
||||||
debug!("Got runtime status for service: {} (load:{}, active:{}, sub:{})", service_name, load_state, active_state, sub_state);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -213,108 +308,34 @@ impl SystemdCollector {
|
|||||||
active_state: "inactive".to_string(),
|
active_state: "inactive".to_string(),
|
||||||
sub_state: "dead".to_string(),
|
sub_state: "dead".to_string(),
|
||||||
});
|
});
|
||||||
debug!("Service {} found in unit files but not runtime - marked as inactive", service_name);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Process all discovered services and apply filters
|
||||||
// Now process all discovered services
|
|
||||||
for service_name in &all_service_names {
|
for service_name in &all_service_names {
|
||||||
debug!("Processing service: '{}'", service_name);
|
// Skip excluded services first
|
||||||
|
let mut is_excluded = false;
|
||||||
// Skip excluded services first
|
for excluded in excluded_services {
|
||||||
let mut is_excluded = false;
|
if service_name.contains(excluded) {
|
||||||
for excluded in excluded_services {
|
is_excluded = true;
|
||||||
if service_name.contains(excluded) {
|
break;
|
||||||
debug!(
|
|
||||||
"EXCLUDING service '{}' because it matches pattern '{}'",
|
|
||||||
service_name, excluded
|
|
||||||
);
|
|
||||||
is_excluded = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if is_excluded {
|
|
||||||
debug!("Skipping excluded service: '{}'", service_name);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if this service matches our filter patterns (supports wildcards)
|
|
||||||
for pattern in service_name_filters {
|
|
||||||
if self.matches_pattern(service_name, pattern) {
|
|
||||||
debug!(
|
|
||||||
"INCLUDING service '{}' because it matches pattern '{}'",
|
|
||||||
service_name, pattern
|
|
||||||
);
|
|
||||||
services.push(service_name.to_string());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!("Service discovery completed: found {} matching services: {:?}", services.len(), services);
|
|
||||||
if services.is_empty() {
|
|
||||||
debug!("No services found matching the configured filters - this may indicate a parsing issue");
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok((services, status_cache))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Check if service name matches pattern (supports wildcards like nginx*)
|
|
||||||
fn matches_pattern(&self, service_name: &str, pattern: &str) -> bool {
|
|
||||||
if pattern.contains('*') {
|
|
||||||
// Wildcard pattern matching
|
|
||||||
if pattern.ends_with('*') {
|
|
||||||
// Pattern like "nginx*" - match if service starts with "nginx"
|
|
||||||
let prefix = &pattern[..pattern.len() - 1];
|
|
||||||
service_name.starts_with(prefix)
|
|
||||||
} else if pattern.starts_with('*') {
|
|
||||||
// Pattern like "*backup" - match if service ends with "backup"
|
|
||||||
let suffix = &pattern[1..];
|
|
||||||
service_name.ends_with(suffix)
|
|
||||||
} else {
|
|
||||||
// Pattern like "nginx*backup" - simple glob matching
|
|
||||||
self.simple_glob_match(service_name, pattern)
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// Exact match (existing behavior)
|
|
||||||
service_name == pattern
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Simple glob pattern matching for patterns with * in middle
|
if is_excluded {
|
||||||
fn simple_glob_match(&self, text: &str, pattern: &str) -> bool {
|
|
||||||
let parts: Vec<&str> = pattern.split('*').collect();
|
|
||||||
if parts.is_empty() {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut pos = 0;
|
|
||||||
for (i, part) in parts.iter().enumerate() {
|
|
||||||
if part.is_empty() {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if i == 0 {
|
// Check if this service matches our filter patterns (supports wildcards)
|
||||||
// First part must match at start
|
for pattern in service_name_filters {
|
||||||
if !text[pos..].starts_with(part) {
|
if self.matches_pattern(service_name, pattern) {
|
||||||
return false;
|
services.push(service_name.to_string());
|
||||||
}
|
break;
|
||||||
pos += part.len();
|
|
||||||
} else if i == parts.len() - 1 {
|
|
||||||
// Last part must match at end
|
|
||||||
return text[pos..].ends_with(part);
|
|
||||||
} else {
|
|
||||||
// Middle part must be found somewhere
|
|
||||||
if let Some(found_pos) = text[pos..].find(part) {
|
|
||||||
pos += found_pos + part.len();
|
|
||||||
} else {
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
true
|
|
||||||
|
Ok((services, status_cache))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get service status from cache (if available) or fallback to systemctl
|
/// Get service status from cache (if available) or fallback to systemctl
|
||||||
@@ -333,76 +354,111 @@ impl SystemdCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fallback to systemctl if not in cache (shouldn't happen during normal operation)
|
// Fallback to systemctl if not in cache
|
||||||
debug!("Service '{}' not found in cache, falling back to systemctl", service);
|
|
||||||
let output = Command::new("systemctl")
|
let output = Command::new("systemctl")
|
||||||
.arg("is-active")
|
.args(&["is-active", &format!("{}.service", service)])
|
||||||
.arg(format!("{}.service", service))
|
|
||||||
.output()?;
|
.output()?;
|
||||||
|
|
||||||
let active_status = String::from_utf8(output.stdout)?.trim().to_string();
|
let active_status = String::from_utf8(output.stdout)?.trim().to_string();
|
||||||
|
|
||||||
// Get more detailed info
|
// Get more detailed info
|
||||||
let output = Command::new("systemctl")
|
let output = Command::new("systemctl")
|
||||||
.arg("show")
|
.args(&["show", &format!("{}.service", service), "--property=LoadState,ActiveState,SubState"])
|
||||||
.arg(format!("{}.service", service))
|
.output()?;
|
||||||
.arg("--property=LoadState,ActiveState,SubState")
|
|
||||||
.output()?;
|
|
||||||
|
|
||||||
let detailed_info = String::from_utf8(output.stdout)?;
|
let detailed_info = String::from_utf8(output.stdout)?;
|
||||||
Ok((active_status, detailed_info))
|
Ok((active_status, detailed_info))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate service status, taking user-stopped services into account
|
/// Check if service name matches pattern (supports wildcards like nginx*)
|
||||||
fn calculate_service_status(&self, service_name: &str, active_status: &str) -> Status {
|
fn matches_pattern(&self, service_name: &str, pattern: &str) -> bool {
|
||||||
match active_status.to_lowercase().as_str() {
|
if pattern.contains('*') {
|
||||||
"active" => Status::Ok,
|
if pattern.ends_with('*') {
|
||||||
"inactive" | "dead" => {
|
// Pattern like "nginx*" - match if service starts with "nginx"
|
||||||
debug!("Service '{}' is inactive - treating as Inactive status", service_name);
|
let prefix = &pattern[..pattern.len() - 1];
|
||||||
Status::Inactive
|
service_name.starts_with(prefix)
|
||||||
},
|
} else if pattern.starts_with('*') {
|
||||||
"failed" | "error" => Status::Critical,
|
// Pattern like "*backup" - match if service ends with "backup"
|
||||||
"activating" | "deactivating" | "reloading" | "start" | "stop" | "restart" => {
|
let suffix = &pattern[1..];
|
||||||
debug!("Service '{}' is transitioning - treating as Pending", service_name);
|
service_name.ends_with(suffix)
|
||||||
Status::Pending
|
} else {
|
||||||
},
|
// Pattern like "nginx*backup" - simple glob matching
|
||||||
_ => Status::Unknown,
|
self.simple_glob_match(service_name, pattern)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Exact match
|
||||||
|
service_name == pattern
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get service memory usage (if available)
|
/// Simple glob matching for patterns with * in the middle
|
||||||
fn get_service_memory(&self, service: &str) -> Option<f32> {
|
fn simple_glob_match(&self, text: &str, pattern: &str) -> bool {
|
||||||
let output = Command::new("systemctl")
|
let parts: Vec<&str> = pattern.split('*').collect();
|
||||||
.arg("show")
|
let mut pos = 0;
|
||||||
.arg(format!("{}.service", service))
|
|
||||||
.arg("--property=MemoryCurrent")
|
|
||||||
.output()
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
let output_str = String::from_utf8(output.stdout).ok()?;
|
for part in parts {
|
||||||
|
if part.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if let Some(found_pos) = text[pos..].find(part) {
|
||||||
|
pos += found_pos + part.len();
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get disk usage for a specific service
|
||||||
|
async fn get_service_disk_usage(&self, service_name: &str) -> Result<f32, CollectorError> {
|
||||||
|
// Check if this service has configured directory paths
|
||||||
|
if let Some(dirs) = self.config.service_directories.get(service_name) {
|
||||||
|
// Service has configured paths - use the first accessible one
|
||||||
|
for dir in dirs {
|
||||||
|
if let Some(size) = self.get_directory_size(dir) {
|
||||||
|
return Ok(size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// If configured paths failed, return 0
|
||||||
|
return Ok(0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// No configured path - try to get WorkingDirectory from systemctl
|
||||||
|
let output = Command::new("systemctl")
|
||||||
|
.args(&["show", &format!("{}.service", service_name), "--property=WorkingDirectory"])
|
||||||
|
.output()
|
||||||
|
.map_err(|e| CollectorError::SystemRead {
|
||||||
|
path: format!("WorkingDirectory for {}", service_name),
|
||||||
|
error: e.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||||
for line in output_str.lines() {
|
for line in output_str.lines() {
|
||||||
if line.starts_with("MemoryCurrent=") {
|
if line.starts_with("WorkingDirectory=") && !line.contains("[not set]") {
|
||||||
let memory_str = line.trim_start_matches("MemoryCurrent=");
|
let dir = line.strip_prefix("WorkingDirectory=").unwrap_or("");
|
||||||
if let Ok(memory_bytes) = memory_str.parse::<u64>() {
|
if !dir.is_empty() && dir != "/" {
|
||||||
return Some(memory_bytes as f32 / (1024.0 * 1024.0)); // Convert to MB
|
return Ok(self.get_directory_size(dir).unwrap_or(0.0));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None
|
|
||||||
|
Ok(0.0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get size of a directory in GB
|
||||||
/// Get directory size in GB with permission-aware logging
|
fn get_directory_size(&self, path: &str) -> Option<f32> {
|
||||||
fn get_directory_size(&self, dir: &str) -> Option<f32> {
|
let output = Command::new("sudo")
|
||||||
let output = Command::new("sudo").arg("du").arg("-sb").arg(dir).output().ok()?;
|
.args(&["du", "-sb", path])
|
||||||
|
.output()
|
||||||
|
.ok()?;
|
||||||
|
|
||||||
if !output.status.success() {
|
if !output.status.success() {
|
||||||
// Log permission errors for debugging but don't spam logs
|
// Log permission errors for debugging but don't spam logs
|
||||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||||
if stderr.contains("Permission denied") {
|
if stderr.contains("Permission denied") {
|
||||||
debug!("Permission denied accessing directory: {}", dir);
|
debug!("Permission denied accessing directory: {}", path);
|
||||||
} else {
|
} else {
|
||||||
debug!("Failed to get size for directory {}: {}", dir, stderr);
|
debug!("Failed to get size for directory {}: {}", path, stderr);
|
||||||
}
|
}
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
@@ -422,34 +478,154 @@ impl SystemdCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get service disk usage - simplified and configuration-driven
|
/// Calculate service status, taking user-stopped services into account
|
||||||
fn get_service_disk_usage(&self, service: &str) -> Option<f32> {
|
fn calculate_service_status(&self, service_name: &str, active_status: &str) -> Status {
|
||||||
// 1. Check if service has configured directories (exact match only)
|
match active_status.to_lowercase().as_str() {
|
||||||
if let Some(dirs) = self.config.service_directories.get(service) {
|
"active" => Status::Ok,
|
||||||
// Service has configured paths - use the first accessible one
|
"inactive" | "dead" => {
|
||||||
for dir in dirs {
|
debug!("Service '{}' is inactive - treating as Inactive status", service_name);
|
||||||
if let Some(size) = self.get_directory_size(dir) {
|
Status::Inactive
|
||||||
return Some(size);
|
},
|
||||||
|
"failed" | "error" => Status::Critical,
|
||||||
|
"activating" | "deactivating" | "reloading" | "starting" | "stopping" => {
|
||||||
|
debug!("Service '{}' is transitioning - treating as Pending", service_name);
|
||||||
|
Status::Pending
|
||||||
|
},
|
||||||
|
_ => Status::Unknown,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get memory usage for a specific service
|
||||||
|
async fn get_service_memory_usage(&self, service_name: &str) -> Result<f32, CollectorError> {
|
||||||
|
let output = Command::new("systemctl")
|
||||||
|
.args(&["show", &format!("{}.service", service_name), "--property=MemoryCurrent"])
|
||||||
|
.output()
|
||||||
|
.map_err(|e| CollectorError::SystemRead {
|
||||||
|
path: format!("memory usage for {}", service_name),
|
||||||
|
error: e.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||||
|
|
||||||
|
for line in output_str.lines() {
|
||||||
|
if line.starts_with("MemoryCurrent=") {
|
||||||
|
if let Some(mem_str) = line.strip_prefix("MemoryCurrent=") {
|
||||||
|
if mem_str != "[not set]" {
|
||||||
|
if let Ok(memory_bytes) = mem_str.parse::<u64>() {
|
||||||
|
return Ok(memory_bytes as f32 / (1024.0 * 1024.0)); // Convert to MB
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// If configured paths failed, return None (shows as 0)
|
|
||||||
return Some(0.0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. No configured path - use systemctl WorkingDirectory
|
Ok(0.0)
|
||||||
let output = Command::new("systemctl")
|
}
|
||||||
.arg("show")
|
|
||||||
.arg(format!("{}.service", service))
|
/// Check if service collection cache should be updated
|
||||||
.arg("--property=WorkingDirectory")
|
fn should_update_cache(&self) -> bool {
|
||||||
|
let state = self.state.read().unwrap();
|
||||||
|
|
||||||
|
match state.last_collection {
|
||||||
|
None => true,
|
||||||
|
Some(last) => {
|
||||||
|
let cache_duration = std::time::Duration::from_secs(30);
|
||||||
|
last.elapsed() > cache_duration
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get cached complete service data with sub-services if available and fresh
|
||||||
|
fn get_cached_complete_services(&self) -> Option<Vec<ServiceData>> {
|
||||||
|
if !self.should_update_cache() {
|
||||||
|
let state = self.state.read().unwrap();
|
||||||
|
Some(state.cached_service_data.clone())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get nginx sites with latency checks (internal - no caching)
|
||||||
|
fn get_nginx_sites_internal(&self) -> Vec<(String, f32)> {
|
||||||
|
let mut sites = Vec::new();
|
||||||
|
|
||||||
|
// Discover nginx sites from configuration
|
||||||
|
let discovered_sites = self.discover_nginx_sites();
|
||||||
|
|
||||||
|
// Always add all discovered sites, even if checks fail (like old version)
|
||||||
|
for (site_name, url) in &discovered_sites {
|
||||||
|
match self.check_site_latency(url) {
|
||||||
|
Ok(latency_ms) => {
|
||||||
|
sites.push((site_name.clone(), latency_ms));
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
// Site is unreachable - use -1.0 to indicate error (like old version)
|
||||||
|
sites.push((site_name.clone(), -1.0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sites
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Discover nginx sites from configuration
|
||||||
|
fn discover_nginx_sites(&self) -> Vec<(String, String)> {
|
||||||
|
// Use the same approach as the old working agent: get nginx config from systemd
|
||||||
|
let config_content = match self.get_nginx_config_from_systemd() {
|
||||||
|
Some(content) => content,
|
||||||
|
None => {
|
||||||
|
debug!("Could not get nginx config from systemd, trying nginx -T fallback");
|
||||||
|
match self.get_nginx_config_via_command() {
|
||||||
|
Some(content) => content,
|
||||||
|
None => {
|
||||||
|
debug!("Could not get nginx config via any method");
|
||||||
|
return Vec::new();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Parse the config content to extract sites
|
||||||
|
self.parse_nginx_config_for_sites(&config_content)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fallback: get nginx config via nginx -T command
|
||||||
|
fn get_nginx_config_via_command(&self) -> Option<String> {
|
||||||
|
let output = Command::new("nginx")
|
||||||
|
.args(&["-T"])
|
||||||
.output()
|
.output()
|
||||||
.ok()?;
|
.ok()?;
|
||||||
|
|
||||||
let output_str = String::from_utf8(output.stdout).ok()?;
|
if !output.status.success() {
|
||||||
for line in output_str.lines() {
|
debug!("nginx -T failed");
|
||||||
if line.starts_with("WorkingDirectory=") && !line.contains("[not set]") {
|
return None;
|
||||||
let dir = line.trim_start_matches("WorkingDirectory=");
|
}
|
||||||
if !dir.is_empty() && dir != "/" {
|
|
||||||
return self.get_directory_size(dir);
|
Some(String::from_utf8_lossy(&output.stdout).to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get nginx config from systemd service definition (NixOS compatible)
|
||||||
|
fn get_nginx_config_from_systemd(&self) -> Option<String> {
|
||||||
|
let output = Command::new("systemctl")
|
||||||
|
.args(&["show", "nginx", "--property=ExecStart", "--no-pager"])
|
||||||
|
.output()
|
||||||
|
.ok()?;
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
debug!("Failed to get nginx ExecStart from systemd");
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||||
|
debug!("systemctl show nginx output: {}", stdout);
|
||||||
|
|
||||||
|
// Parse ExecStart to extract -c config path
|
||||||
|
for line in stdout.lines() {
|
||||||
|
if line.starts_with("ExecStart=") {
|
||||||
|
debug!("Found ExecStart line: {}", line);
|
||||||
|
if let Some(config_path) = self.extract_config_path_from_exec_start(line) {
|
||||||
|
debug!("Extracted config path: {}", config_path);
|
||||||
|
return std::fs::read_to_string(&config_path).ok();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -457,194 +633,109 @@ impl SystemdCollector {
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extract config path from ExecStart line
|
||||||
|
fn extract_config_path_from_exec_start(&self, exec_start: &str) -> Option<String> {
|
||||||
|
// Remove ExecStart= prefix
|
||||||
|
let exec_part = exec_start.strip_prefix("ExecStart=")?;
|
||||||
|
debug!("Parsing exec part: {}", exec_part);
|
||||||
|
|
||||||
|
// Handle NixOS format: ExecStart={ path=...; argv[]=...nginx -c /config; ... }
|
||||||
|
if exec_part.contains("argv[]=") {
|
||||||
|
// Extract the part after argv[]=
|
||||||
|
let argv_start = exec_part.find("argv[]=")?;
|
||||||
|
let argv_part = &exec_part[argv_start + 7..]; // Skip "argv[]="
|
||||||
|
debug!("Found NixOS argv part: {}", argv_part);
|
||||||
|
|
||||||
|
// Look for -c flag followed by config path
|
||||||
|
if let Some(c_pos) = argv_part.find(" -c ") {
|
||||||
|
let after_c = &argv_part[c_pos + 4..];
|
||||||
}
|
// Find the config path (until next space or semicolon)
|
||||||
|
let config_path = after_c.split([' ', ';']).next()?;
|
||||||
#[async_trait]
|
return Some(config_path.to_string());
|
||||||
impl Collector for SystemdCollector {
|
|
||||||
|
|
||||||
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
|
||||||
let start_time = Instant::now();
|
|
||||||
debug!("Collecting systemd services metrics");
|
|
||||||
|
|
||||||
let mut metrics = Vec::new();
|
|
||||||
|
|
||||||
// Get cached services (discovery only happens when needed)
|
|
||||||
let monitored_services = match self.get_monitored_services() {
|
|
||||||
Ok(services) => services,
|
|
||||||
Err(e) => {
|
|
||||||
debug!("Failed to get monitored services: {}", e);
|
|
||||||
return Ok(metrics);
|
|
||||||
}
|
}
|
||||||
};
|
} else {
|
||||||
|
// Handle traditional format: ExecStart=/path/nginx -c /config
|
||||||
// Collect individual metrics for each monitored service (status, memory, disk only)
|
debug!("Parsing traditional format");
|
||||||
for service in &monitored_services {
|
if let Some(c_pos) = exec_part.find(" -c ") {
|
||||||
match self.get_service_status(service) {
|
let after_c = &exec_part[c_pos + 4..];
|
||||||
Ok((active_status, _detailed_info)) => {
|
let config_path = after_c.split_whitespace().next()?;
|
||||||
let status = self.calculate_service_status(service, &active_status);
|
return Some(config_path.to_string());
|
||||||
|
|
||||||
// Individual service status metric
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("service_{}_status", service),
|
|
||||||
value: MetricValue::String(active_status.clone()),
|
|
||||||
unit: None,
|
|
||||||
description: Some(format!("Service {} status", service)),
|
|
||||||
status,
|
|
||||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Service memory usage (if available)
|
|
||||||
if let Some(memory_mb) = self.get_service_memory(service) {
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("service_{}_memory_mb", service),
|
|
||||||
value: MetricValue::Float(memory_mb),
|
|
||||||
unit: Some("MB".to_string()),
|
|
||||||
description: Some(format!("Service {} memory usage", service)),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Service disk usage (comprehensive detection)
|
|
||||||
if let Some(disk_gb) = self.get_service_disk_usage(service) {
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("service_{}_disk_gb", service),
|
|
||||||
value: MetricValue::Float(disk_gb),
|
|
||||||
unit: Some("GB".to_string()),
|
|
||||||
description: Some(format!("Service {} disk usage", service)),
|
|
||||||
status: Status::Ok,
|
|
||||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sub-service metrics for specific services
|
|
||||||
if service.contains("nginx") && active_status == "active" {
|
|
||||||
metrics.extend(self.get_nginx_site_metrics());
|
|
||||||
}
|
|
||||||
|
|
||||||
if service.contains("docker") && active_status == "active" {
|
|
||||||
metrics.extend(self.get_docker_containers());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
debug!("Failed to get status for service {}: {}", service, e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let collection_time = start_time.elapsed();
|
None
|
||||||
debug!(
|
|
||||||
"Systemd collection completed in {:?} with {} individual service metrics",
|
|
||||||
collection_time,
|
|
||||||
metrics.len()
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(metrics)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
/// Parse nginx config content to extract server names and build site list
|
||||||
|
fn parse_nginx_config_for_sites(&self, config_content: &str) -> Vec<(String, String)> {
|
||||||
|
let mut sites = Vec::new();
|
||||||
|
let lines: Vec<&str> = config_content.lines().collect();
|
||||||
|
let mut i = 0;
|
||||||
|
|
||||||
impl SystemdCollector {
|
debug!("Parsing nginx config with {} lines", lines.len());
|
||||||
/// Get nginx sites with latency checks
|
|
||||||
fn get_nginx_sites(&self) -> Vec<Metric> {
|
|
||||||
let mut metrics = Vec::new();
|
|
||||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
|
||||||
|
|
||||||
// Discover nginx sites from configuration
|
while i < lines.len() {
|
||||||
let sites = self.discover_nginx_sites();
|
let line = lines[i].trim();
|
||||||
|
if line.starts_with("server") && line.contains("{") {
|
||||||
for (site_name, url) in &sites {
|
if let Some(server_name) = self.parse_server_block(&lines, &mut i) {
|
||||||
match self.check_site_latency(url) {
|
let url = format!("https://{}", server_name);
|
||||||
Ok(latency_ms) => {
|
sites.push((server_name.clone(), url));
|
||||||
let status = if latency_ms < self.config.nginx_latency_critical_ms {
|
|
||||||
Status::Ok
|
|
||||||
} else {
|
|
||||||
Status::Critical
|
|
||||||
};
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("service_nginx_{}_latency_ms", site_name),
|
|
||||||
value: MetricValue::Float(latency_ms),
|
|
||||||
unit: Some("ms".to_string()),
|
|
||||||
description: Some(format!("Response time for {}", url)),
|
|
||||||
status,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Err(_) => {
|
|
||||||
// Site is unreachable
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("service_nginx_{}_latency_ms", site_name),
|
|
||||||
value: MetricValue::Float(-1.0), // Use -1 to indicate error
|
|
||||||
unit: Some("ms".to_string()),
|
|
||||||
description: Some(format!("Response time for {} (unreachable)", url)),
|
|
||||||
status: Status::Critical,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
i += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
metrics
|
debug!("Discovered {} nginx sites total", sites.len());
|
||||||
|
sites
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get docker containers as sub-services
|
/// Parse a server block to extract the primary server_name
|
||||||
fn get_docker_containers(&self) -> Vec<Metric> {
|
fn parse_server_block(&self, lines: &[&str], start_index: &mut usize) -> Option<String> {
|
||||||
let mut metrics = Vec::new();
|
let mut server_names = Vec::new();
|
||||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
let mut has_redirect = false;
|
||||||
|
let mut i = *start_index + 1;
|
||||||
|
let mut brace_count = 1;
|
||||||
|
|
||||||
// Check if docker is available
|
// Parse until we close the server block
|
||||||
let output = Command::new("docker")
|
while i < lines.len() && brace_count > 0 {
|
||||||
.arg("ps")
|
let trimmed = lines[i].trim();
|
||||||
.arg("--format")
|
|
||||||
.arg("{{.Names}},{{.Status}}")
|
|
||||||
.output();
|
|
||||||
|
|
||||||
let output = match output {
|
// Track braces
|
||||||
Ok(out) if out.status.success() => out,
|
brace_count += trimmed.matches('{').count();
|
||||||
_ => return metrics, // Docker not available or failed
|
brace_count -= trimmed.matches('}').count();
|
||||||
};
|
|
||||||
|
|
||||||
let output_str = match String::from_utf8(output.stdout) {
|
// Extract server_name
|
||||||
Ok(s) => s,
|
if trimmed.starts_with("server_name") {
|
||||||
Err(_) => return metrics,
|
if let Some(names_part) = trimmed.strip_prefix("server_name") {
|
||||||
};
|
let names_clean = names_part.trim().trim_end_matches(';');
|
||||||
|
for name in names_clean.split_whitespace() {
|
||||||
for line in output_str.lines() {
|
if name != "_"
|
||||||
if line.trim().is_empty() {
|
&& !name.is_empty()
|
||||||
continue;
|
&& name.contains('.')
|
||||||
|
&& !name.starts_with('$')
|
||||||
|
{
|
||||||
|
server_names.push(name.to_string());
|
||||||
|
debug!("Found server_name in block: {}", name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let parts: Vec<&str> = line.split(',').collect();
|
// Check for redirects (skip redirect-only servers)
|
||||||
if parts.len() >= 2 {
|
if trimmed.contains("return") && (trimmed.contains("301") || trimmed.contains("302")) {
|
||||||
let container_name = parts[0].trim();
|
has_redirect = true;
|
||||||
let status_str = parts[1].trim();
|
|
||||||
|
|
||||||
let status = if status_str.contains("Up") {
|
|
||||||
Status::Ok
|
|
||||||
} else if status_str.contains("Exited") {
|
|
||||||
Status::Warning
|
|
||||||
} else {
|
|
||||||
Status::Critical
|
|
||||||
};
|
|
||||||
|
|
||||||
metrics.push(Metric {
|
|
||||||
name: format!("service_docker_{}_status", container_name),
|
|
||||||
value: MetricValue::String(status_str.to_string()),
|
|
||||||
unit: None,
|
|
||||||
description: Some(format!("Docker container {} status", container_name)),
|
|
||||||
status,
|
|
||||||
timestamp,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
i += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
metrics
|
*start_index = i - 1;
|
||||||
|
|
||||||
|
if !server_names.is_empty() && !has_redirect {
|
||||||
|
return Some(server_names[0].clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check site latency using HTTP GET requests
|
/// Check site latency using HTTP GET requests
|
||||||
@@ -678,188 +769,113 @@ impl SystemdCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Discover nginx sites from configuration files (like the old working implementation)
|
/// Get docker containers as sub-services
|
||||||
fn discover_nginx_sites(&self) -> Vec<(String, String)> {
|
fn get_docker_containers(&self) -> Vec<(String, String)> {
|
||||||
use tracing::debug;
|
let mut containers = Vec::new();
|
||||||
|
|
||||||
// Use the same approach as the old working agent: get nginx config from systemd
|
// Check if docker is available (cm-agent user is in docker group)
|
||||||
let config_content = match self.get_nginx_config_from_systemd() {
|
// Use -a to show ALL containers (running and stopped)
|
||||||
Some(content) => content,
|
let output = Command::new("docker")
|
||||||
None => {
|
.args(&["ps", "-a", "--format", "{{.Names}},{{.Status}}"])
|
||||||
debug!("Could not get nginx config from systemd, trying nginx -T fallback");
|
.output();
|
||||||
match self.get_nginx_config_via_command() {
|
|
||||||
Some(content) => content,
|
let output = match output {
|
||||||
None => {
|
Ok(out) if out.status.success() => out,
|
||||||
debug!("Could not get nginx config via any method");
|
_ => return containers, // Docker not available or failed
|
||||||
return Vec::new();
|
};
|
||||||
}
|
|
||||||
}
|
let output_str = match String::from_utf8(output.stdout) {
|
||||||
|
Ok(s) => s,
|
||||||
|
Err(_) => return containers,
|
||||||
|
};
|
||||||
|
|
||||||
|
for line in output_str.lines() {
|
||||||
|
if line.trim().is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let parts: Vec<&str> = line.split(',').collect();
|
||||||
|
if parts.len() >= 2 {
|
||||||
|
let container_name = parts[0].trim();
|
||||||
|
let status_str = parts[1].trim();
|
||||||
|
|
||||||
|
let container_status = if status_str.contains("Up") {
|
||||||
|
"active"
|
||||||
|
} else if status_str.contains("Exited") || status_str.contains("Created") {
|
||||||
|
"inactive" // Stopped/created containers are inactive
|
||||||
|
} else {
|
||||||
|
"failed" // Other states (restarting, paused, dead) → failed
|
||||||
|
};
|
||||||
|
|
||||||
|
containers.push((format!("docker_{}", container_name), container_status.to_string()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
containers
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get docker images as sub-services
|
||||||
|
fn get_docker_images(&self) -> Vec<(String, String, String)> {
|
||||||
|
let mut images = Vec::new();
|
||||||
|
// Check if docker is available (cm-agent user is in docker group)
|
||||||
|
let output = Command::new("docker")
|
||||||
|
.args(&["images", "--format", "{{.Repository}}:{{.Tag}},{{.Size}}"])
|
||||||
|
.output();
|
||||||
|
|
||||||
|
let output = match output {
|
||||||
|
Ok(out) if out.status.success() => out,
|
||||||
|
Ok(_) => {
|
||||||
|
return images;
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
return images;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Parse the config content to extract sites
|
let output_str = match String::from_utf8(output.stdout) {
|
||||||
self.parse_nginx_config_for_sites(&config_content)
|
Ok(s) => s,
|
||||||
}
|
Err(_) => return images,
|
||||||
|
};
|
||||||
|
|
||||||
/// Get nginx config from systemd service definition (NixOS compatible)
|
for line in output_str.lines() {
|
||||||
fn get_nginx_config_from_systemd(&self) -> Option<String> {
|
if line.trim().is_empty() {
|
||||||
use tracing::debug;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
let output = std::process::Command::new("systemctl")
|
let parts: Vec<&str> = line.split(',').collect();
|
||||||
.args(["show", "nginx", "--property=ExecStart", "--no-pager"])
|
if parts.len() >= 2 {
|
||||||
.output()
|
let image_name = parts[0].trim();
|
||||||
.ok()?;
|
let size = parts[1].trim();
|
||||||
|
|
||||||
if !output.status.success() {
|
// Skip <none>:<none> images (dangling images)
|
||||||
debug!("Failed to get nginx ExecStart from systemd");
|
if image_name.contains("<none>") {
|
||||||
return None;
|
continue;
|
||||||
}
|
|
||||||
|
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
||||||
debug!("systemctl show nginx output: {}", stdout);
|
|
||||||
|
|
||||||
// Parse ExecStart to extract -c config path
|
|
||||||
for line in stdout.lines() {
|
|
||||||
if line.starts_with("ExecStart=") {
|
|
||||||
debug!("Found ExecStart line: {}", line);
|
|
||||||
// Handle both traditional and NixOS systemd formats
|
|
||||||
if let Some(config_path) = self.extract_config_path_from_exec_start(line) {
|
|
||||||
debug!("Extracted config path: {}", config_path);
|
|
||||||
// Read the config file
|
|
||||||
return std::fs::read_to_string(&config_path)
|
|
||||||
.map_err(|e| debug!("Failed to read config file {}: {}", config_path, e))
|
|
||||||
.ok();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
images.push((
|
||||||
|
format!("image_{}", image_name),
|
||||||
|
"active".to_string(), // Images are always "active" (present)
|
||||||
|
size.to_string()
|
||||||
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
None
|
images
|
||||||
}
|
}
|
||||||
|
}
|
||||||
/// Extract config path from ExecStart line
|
|
||||||
fn extract_config_path_from_exec_start(&self, exec_start: &str) -> Option<String> {
|
#[async_trait]
|
||||||
use tracing::debug;
|
impl Collector for SystemdCollector {
|
||||||
|
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
// Remove ExecStart= prefix
|
// Use cached complete data if available and fresh
|
||||||
let exec_part = exec_start.strip_prefix("ExecStart=")?;
|
if let Some(cached_complete_services) = self.get_cached_complete_services() {
|
||||||
debug!("Parsing exec part: {}", exec_part);
|
for service_data in cached_complete_services {
|
||||||
|
agent_data.services.push(service_data);
|
||||||
// Handle NixOS format: ExecStart={ path=...; argv[]=...nginx -c /config; ... }
|
}
|
||||||
if exec_part.contains("argv[]=") {
|
Ok(())
|
||||||
// Extract the part after argv[]=
|
} else {
|
||||||
let argv_start = exec_part.find("argv[]=")?;
|
// Collect fresh data
|
||||||
let argv_part = &exec_part[argv_start + 7..]; // Skip "argv[]="
|
self.collect_service_data(agent_data).await
|
||||||
debug!("Found NixOS argv part: {}", argv_part);
|
}
|
||||||
|
|
||||||
// Look for -c flag followed by config path
|
|
||||||
if let Some(c_pos) = argv_part.find(" -c ") {
|
|
||||||
let after_c = &argv_part[c_pos + 4..];
|
|
||||||
// Find the config path (until next space or semicolon)
|
|
||||||
let config_path = after_c.split([' ', ';']).next()?;
|
|
||||||
return Some(config_path.to_string());
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Handle traditional format: ExecStart=/path/nginx -c /config
|
|
||||||
debug!("Parsing traditional format");
|
|
||||||
if let Some(c_pos) = exec_part.find(" -c ") {
|
|
||||||
let after_c = &exec_part[c_pos + 4..];
|
|
||||||
let config_path = after_c.split_whitespace().next()?;
|
|
||||||
return Some(config_path.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Fallback: get nginx config via nginx -T command
|
|
||||||
fn get_nginx_config_via_command(&self) -> Option<String> {
|
|
||||||
use tracing::debug;
|
|
||||||
|
|
||||||
let output = std::process::Command::new("nginx")
|
|
||||||
.args(["-T"])
|
|
||||||
.output()
|
|
||||||
.ok()?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
debug!("nginx -T failed");
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
Some(String::from_utf8_lossy(&output.stdout).to_string())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Parse nginx config content to extract server names and build site list
|
|
||||||
fn parse_nginx_config_for_sites(&self, config_content: &str) -> Vec<(String, String)> {
|
|
||||||
use tracing::debug;
|
|
||||||
let mut sites = Vec::new();
|
|
||||||
let lines: Vec<&str> = config_content.lines().collect();
|
|
||||||
let mut i = 0;
|
|
||||||
|
|
||||||
debug!("Parsing nginx config with {} lines", lines.len());
|
|
||||||
|
|
||||||
while i < lines.len() {
|
|
||||||
let line = lines[i].trim();
|
|
||||||
if line.starts_with("server") && line.contains("{") {
|
|
||||||
if let Some(server_name) = self.parse_server_block(&lines, &mut i) {
|
|
||||||
let url = format!("https://{}", server_name);
|
|
||||||
sites.push((server_name.clone(), url));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
i += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!("Discovered {} nginx sites total", sites.len());
|
|
||||||
sites
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Parse a server block to extract the primary server_name
|
|
||||||
fn parse_server_block(&self, lines: &[&str], start_index: &mut usize) -> Option<String> {
|
|
||||||
use tracing::debug;
|
|
||||||
let mut server_names = Vec::new();
|
|
||||||
let mut has_redirect = false;
|
|
||||||
let mut i = *start_index + 1;
|
|
||||||
let mut brace_count = 1;
|
|
||||||
|
|
||||||
// Parse until we close the server block
|
|
||||||
while i < lines.len() && brace_count > 0 {
|
|
||||||
let trimmed = lines[i].trim();
|
|
||||||
|
|
||||||
// Track braces
|
|
||||||
brace_count += trimmed.matches('{').count();
|
|
||||||
brace_count -= trimmed.matches('}').count();
|
|
||||||
|
|
||||||
// Extract server_name
|
|
||||||
if trimmed.starts_with("server_name") {
|
|
||||||
if let Some(names_part) = trimmed.strip_prefix("server_name") {
|
|
||||||
let names_clean = names_part.trim().trim_end_matches(';');
|
|
||||||
for name in names_clean.split_whitespace() {
|
|
||||||
if name != "_"
|
|
||||||
&& !name.is_empty()
|
|
||||||
&& name.contains('.')
|
|
||||||
&& !name.starts_with('$')
|
|
||||||
{
|
|
||||||
server_names.push(name.to_string());
|
|
||||||
debug!("Found server_name in block: {}", name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Check for redirects (skip redirect-only servers)
|
|
||||||
if trimmed.contains("return") && (trimmed.contains("301") || trimmed.contains("302")) {
|
|
||||||
has_redirect = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
i += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
*start_index = i - 1;
|
|
||||||
|
|
||||||
if !server_names.is_empty() && !has_redirect {
|
|
||||||
return Some(server_names[0].clone());
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
403
agent/src/collectors/systemd_old.rs
Normal file
403
agent/src/collectors/systemd_old.rs
Normal file
@@ -0,0 +1,403 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use cm_dashboard_shared::{AgentData, ServiceData, Status};
|
||||||
|
use std::process::Command;
|
||||||
|
use std::sync::RwLock;
|
||||||
|
use std::time::Instant;
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use super::{Collector, CollectorError};
|
||||||
|
use crate::config::SystemdConfig;
|
||||||
|
|
||||||
|
/// Systemd collector for monitoring systemd services with structured data output
|
||||||
|
pub struct SystemdCollector {
|
||||||
|
/// Cached state with thread-safe interior mutability
|
||||||
|
state: RwLock<ServiceCacheState>,
|
||||||
|
/// Configuration for service monitoring
|
||||||
|
config: SystemdConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Internal state for service caching
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct ServiceCacheState {
|
||||||
|
/// Last collection time for performance tracking
|
||||||
|
last_collection: Option<Instant>,
|
||||||
|
/// Cached service data
|
||||||
|
services: Vec<ServiceInfo>,
|
||||||
|
/// Interesting services to monitor (cached after discovery)
|
||||||
|
monitored_services: Vec<String>,
|
||||||
|
/// Cached service status information from discovery
|
||||||
|
service_status_cache: std::collections::HashMap<String, ServiceStatusInfo>,
|
||||||
|
/// Last time services were discovered
|
||||||
|
last_discovery_time: Option<Instant>,
|
||||||
|
/// How often to rediscover services (from config)
|
||||||
|
discovery_interval_seconds: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Status columns of one service as cached from `systemctl list-units`.
#[derive(Clone, Debug)]
struct ServiceStatusInfo {
    /// Value of the LOAD column
    load_state: String,
    /// Value of the ACTIVE column
    active_state: String,
    /// Value of the SUB column
    sub_state: String,
}
|
||||||
|
|
||||||
|
/// Snapshot of a single service as gathered by the collector.
#[derive(Clone, Debug)]
struct ServiceInfo {
    /// Service name
    name: String,
    /// Service state such as "active", "inactive" or "failed"
    status: String,
    /// Memory usage in megabytes
    memory_mb: f32,
    /// Disk usage in gigabytes (usually 0 for services)
    disk_gb: f32,
}
|
||||||
|
|
||||||
|
impl SystemdCollector {
|
||||||
|
pub fn new(config: SystemdConfig) -> Self {
|
||||||
|
let state = ServiceCacheState {
|
||||||
|
last_collection: None,
|
||||||
|
services: Vec::new(),
|
||||||
|
monitored_services: Vec::new(),
|
||||||
|
service_status_cache: std::collections::HashMap::new(),
|
||||||
|
last_discovery_time: None,
|
||||||
|
discovery_interval_seconds: config.interval_seconds,
|
||||||
|
};
|
||||||
|
|
||||||
|
Self {
|
||||||
|
state: RwLock::new(state),
|
||||||
|
config,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Collect service data and populate AgentData
|
||||||
|
async fn collect_service_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
|
let start_time = Instant::now();
|
||||||
|
debug!("Collecting systemd services metrics");
|
||||||
|
|
||||||
|
// Get cached services (discovery only happens when needed)
|
||||||
|
let monitored_services = match self.get_monitored_services() {
|
||||||
|
Ok(services) => services,
|
||||||
|
Err(e) => {
|
||||||
|
debug!("Failed to get monitored services: {}", e);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Collect service data for each monitored service
|
||||||
|
let mut services = Vec::new();
|
||||||
|
for service_name in &monitored_services {
|
||||||
|
match self.get_service_status(service_name) {
|
||||||
|
Ok((active_status, _detailed_info)) => {
|
||||||
|
let memory_mb = self.get_service_memory_usage(service_name).await.unwrap_or(0.0);
|
||||||
|
let disk_gb = self.get_service_disk_usage(service_name).await.unwrap_or(0.0);
|
||||||
|
|
||||||
|
let service_info = ServiceInfo {
|
||||||
|
name: service_name.clone(),
|
||||||
|
status: active_status,
|
||||||
|
memory_mb,
|
||||||
|
disk_gb,
|
||||||
|
};
|
||||||
|
services.push(service_info);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
debug!("Failed to get status for service {}: {}", service_name, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update cached state
|
||||||
|
{
|
||||||
|
let mut state = self.state.write().unwrap();
|
||||||
|
state.last_collection = Some(start_time);
|
||||||
|
state.services = services.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Populate AgentData with service information
|
||||||
|
for service in services {
|
||||||
|
agent_data.services.push(ServiceData {
|
||||||
|
name: service.name.clone(),
|
||||||
|
status: service.status.clone(),
|
||||||
|
memory_mb: service.memory_mb,
|
||||||
|
disk_gb: service.disk_gb,
|
||||||
|
user_stopped: false, // TODO: Integrate with service tracker
|
||||||
|
service_status: self.calculate_service_status(&service.name, &service.status),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let elapsed = start_time.elapsed();
|
||||||
|
debug!("Systemd collection completed in {:?} with {} services", elapsed, agent_data.services.len());
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get systemd services information
|
||||||
|
async fn get_systemd_services(&self) -> Result<Vec<ServiceInfo>, CollectorError> {
|
||||||
|
let mut services = Vec::new();
|
||||||
|
|
||||||
|
// Get ALL service unit files (includes inactive services)
|
||||||
|
let unit_files_output = Command::new("systemctl")
|
||||||
|
.args(&["list-unit-files", "--type=service", "--no-pager", "--plain"])
|
||||||
|
.output()
|
||||||
|
.map_err(|e| CollectorError::SystemRead {
|
||||||
|
path: "systemctl list-unit-files".to_string(),
|
||||||
|
error: e.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// Get runtime status of ALL units (including inactive)
|
||||||
|
let status_output = Command::new("systemctl")
|
||||||
|
.args(&["list-units", "--type=service", "--all", "--no-pager", "--plain"])
|
||||||
|
.output()
|
||||||
|
.map_err(|e| CollectorError::SystemRead {
|
||||||
|
path: "systemctl list-units --all".to_string(),
|
||||||
|
error: e.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let unit_files_str = String::from_utf8_lossy(&unit_files_output.stdout);
|
||||||
|
let status_str = String::from_utf8_lossy(&status_output.stdout);
|
||||||
|
|
||||||
|
// Parse all service unit files to get complete service list
|
||||||
|
let mut all_service_names = std::collections::HashSet::new();
|
||||||
|
for line in unit_files_str.lines() {
|
||||||
|
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||||
|
if fields.len() >= 2 && fields[0].ends_with(".service") {
|
||||||
|
let service_name = fields[0].trim_end_matches(".service");
|
||||||
|
all_service_names.insert(service_name.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse runtime status for all units
|
||||||
|
let mut status_cache = std::collections::HashMap::new();
|
||||||
|
for line in status_str.lines() {
|
||||||
|
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||||
|
if fields.len() >= 4 && fields[0].ends_with(".service") {
|
||||||
|
let service_name = fields[0].trim_end_matches(".service");
|
||||||
|
let load_state = fields.get(1).unwrap_or(&"unknown").to_string();
|
||||||
|
let active_state = fields.get(2).unwrap_or(&"unknown").to_string();
|
||||||
|
let sub_state = fields.get(3).unwrap_or(&"unknown").to_string();
|
||||||
|
status_cache.insert(service_name.to_string(), (load_state, active_state, sub_state));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// For services found in unit files but not in runtime status, set default inactive status
|
||||||
|
for service_name in &all_service_names {
|
||||||
|
if !status_cache.contains_key(service_name) {
|
||||||
|
status_cache.insert(service_name.to_string(), (
|
||||||
|
"not-loaded".to_string(),
|
||||||
|
"inactive".to_string(),
|
||||||
|
"dead".to_string()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process all discovered services and apply filters
|
||||||
|
for service_name in &all_service_names {
|
||||||
|
if self.should_monitor_service(service_name) {
|
||||||
|
if let Some((load_state, active_state, sub_state)) = status_cache.get(service_name) {
|
||||||
|
let memory_mb = self.get_service_memory_usage(service_name).await.unwrap_or(0.0);
|
||||||
|
let disk_gb = self.get_service_disk_usage(service_name).await.unwrap_or(0.0);
|
||||||
|
|
||||||
|
let normalized_status = self.normalize_service_status(active_state, sub_state);
|
||||||
|
let service_info = ServiceInfo {
|
||||||
|
name: service_name.to_string(),
|
||||||
|
status: normalized_status,
|
||||||
|
memory_mb,
|
||||||
|
disk_gb,
|
||||||
|
};
|
||||||
|
|
||||||
|
services.push(service_info);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(services)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if a service should be monitored based on configuration filters with wildcard support
|
||||||
|
fn should_monitor_service(&self, service_name: &str) -> bool {
|
||||||
|
// If no filters configured, monitor nothing (to prevent noise)
|
||||||
|
if self.config.service_name_filters.is_empty() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if service matches any of the configured patterns
|
||||||
|
for pattern in &self.config.service_name_filters {
|
||||||
|
if self.matches_pattern(service_name, pattern) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if service name matches pattern (supports wildcards like nginx*)
|
||||||
|
fn matches_pattern(&self, service_name: &str, pattern: &str) -> bool {
|
||||||
|
if pattern.ends_with('*') {
|
||||||
|
let prefix = &pattern[..pattern.len() - 1];
|
||||||
|
service_name.starts_with(prefix)
|
||||||
|
} else {
|
||||||
|
service_name == pattern
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get disk usage for a specific service
|
||||||
|
async fn get_service_disk_usage(&self, service_name: &str) -> Result<f32, CollectorError> {
|
||||||
|
// Check if this service has configured directory paths
|
||||||
|
if let Some(dirs) = self.config.service_directories.get(service_name) {
|
||||||
|
// Service has configured paths - use the first accessible one
|
||||||
|
for dir in dirs {
|
||||||
|
if let Some(size) = self.get_directory_size(dir) {
|
||||||
|
return Ok(size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// If configured paths failed, return 0
|
||||||
|
return Ok(0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// No configured path - try to get WorkingDirectory from systemctl
|
||||||
|
let output = Command::new("systemctl")
|
||||||
|
.args(&["show", &format!("{}.service", service_name), "--property=WorkingDirectory"])
|
||||||
|
.output()
|
||||||
|
.map_err(|e| CollectorError::SystemRead {
|
||||||
|
path: format!("WorkingDirectory for {}", service_name),
|
||||||
|
error: e.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||||
|
for line in output_str.lines() {
|
||||||
|
if line.starts_with("WorkingDirectory=") && !line.contains("[not set]") {
|
||||||
|
let dir = line.strip_prefix("WorkingDirectory=").unwrap_or("");
|
||||||
|
if !dir.is_empty() {
|
||||||
|
return Ok(self.get_directory_size(dir).unwrap_or(0.0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(0.0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get size of a directory in GB
|
||||||
|
fn get_directory_size(&self, path: &str) -> Option<f32> {
|
||||||
|
let output = Command::new("du")
|
||||||
|
.args(&["-sb", path])
|
||||||
|
.output()
|
||||||
|
.ok()?;
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||||
|
let parts: Vec<&str> = output_str.split_whitespace().collect();
|
||||||
|
if let Some(size_str) = parts.first() {
|
||||||
|
if let Ok(size_bytes) = size_str.parse::<u64>() {
|
||||||
|
return Some(size_bytes as f32 / (1024.0 * 1024.0 * 1024.0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate service status, taking user-stopped services into account
|
||||||
|
fn calculate_service_status(&self, service_name: &str, active_status: &str) -> Status {
|
||||||
|
match active_status.to_lowercase().as_str() {
|
||||||
|
"active" => Status::Ok,
|
||||||
|
"inactive" | "dead" => {
|
||||||
|
debug!("Service '{}' is inactive - treating as Inactive status", service_name);
|
||||||
|
Status::Inactive
|
||||||
|
},
|
||||||
|
"failed" | "error" => Status::Critical,
|
||||||
|
"activating" | "deactivating" | "reloading" | "starting" | "stopping" => {
|
||||||
|
debug!("Service '{}' is transitioning - treating as Pending", service_name);
|
||||||
|
Status::Pending
|
||||||
|
},
|
||||||
|
_ => Status::Unknown,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get memory usage for a specific service
|
||||||
|
async fn get_service_memory_usage(&self, service_name: &str) -> Result<f32, CollectorError> {
|
||||||
|
let output = Command::new("systemctl")
|
||||||
|
.args(&["show", &format!("{}.service", service_name), "--property=MemoryCurrent"])
|
||||||
|
.output()
|
||||||
|
.map_err(|e| CollectorError::SystemRead {
|
||||||
|
path: format!("memory usage for {}", service_name),
|
||||||
|
error: e.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||||
|
|
||||||
|
for line in output_str.lines() {
|
||||||
|
if line.starts_with("MemoryCurrent=") {
|
||||||
|
if let Some(mem_str) = line.strip_prefix("MemoryCurrent=") {
|
||||||
|
if mem_str != "[not set]" {
|
||||||
|
if let Ok(memory_bytes) = mem_str.parse::<u64>() {
|
||||||
|
return Ok(memory_bytes as f32 / (1024.0 * 1024.0)); // Convert to MB
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(0.0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Normalize service status to standard values
|
||||||
|
fn normalize_service_status(&self, active_state: &str, sub_state: &str) -> String {
|
||||||
|
match (active_state, sub_state) {
|
||||||
|
("active", "running") => "active".to_string(),
|
||||||
|
("active", _) => "active".to_string(),
|
||||||
|
("inactive", "dead") => "inactive".to_string(),
|
||||||
|
("inactive", _) => "inactive".to_string(),
|
||||||
|
("failed", _) => "failed".to_string(),
|
||||||
|
("activating", _) => "starting".to_string(),
|
||||||
|
("deactivating", _) => "stopping".to_string(),
|
||||||
|
_ => format!("{}:{}", active_state, sub_state),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if service collection cache should be updated
|
||||||
|
fn should_update_cache(&self) -> bool {
|
||||||
|
let state = self.state.read().unwrap();
|
||||||
|
|
||||||
|
match state.last_collection {
|
||||||
|
None => true,
|
||||||
|
Some(last) => {
|
||||||
|
let cache_duration = std::time::Duration::from_secs(30);
|
||||||
|
last.elapsed() > cache_duration
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get cached service data if available and fresh
|
||||||
|
fn get_cached_services(&self) -> Option<Vec<ServiceInfo>> {
|
||||||
|
if !self.should_update_cache() {
|
||||||
|
let state = self.state.read().unwrap();
|
||||||
|
Some(state.services.clone())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Collector for SystemdCollector {
|
||||||
|
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||||
|
// Use cached data if available and fresh
|
||||||
|
if let Some(cached_services) = self.get_cached_services() {
|
||||||
|
debug!("Using cached systemd services data");
|
||||||
|
for service in cached_services {
|
||||||
|
agent_data.services.push(ServiceData {
|
||||||
|
name: service.name.clone(),
|
||||||
|
status: service.status.clone(),
|
||||||
|
memory_mb: service.memory_mb,
|
||||||
|
disk_gb: service.disk_gb,
|
||||||
|
user_stopped: false, // TODO: Integrate with service tracker
|
||||||
|
service_status: self.calculate_service_status(&service.name, &service.status),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
// Collect fresh data
|
||||||
|
self.collect_service_data(agent_data).await
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use cm_dashboard_shared::{MessageEnvelope, MetricMessage};
|
use cm_dashboard_shared::{AgentData, MessageEnvelope};
|
||||||
use tracing::{debug, info};
|
use tracing::{debug, info};
|
||||||
use zmq::{Context, Socket, SocketType};
|
use zmq::{Context, Socket, SocketType};
|
||||||
|
|
||||||
@@ -43,17 +43,17 @@ impl ZmqHandler {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Publish metrics message via ZMQ
|
|
||||||
pub async fn publish_metrics(&self, message: &MetricMessage) -> Result<()> {
|
/// Publish agent data via ZMQ
|
||||||
|
pub async fn publish_agent_data(&self, data: &AgentData) -> Result<()> {
|
||||||
debug!(
|
debug!(
|
||||||
"Publishing {} metrics for host {}",
|
"Publishing agent data for host {}",
|
||||||
message.metrics.len(),
|
data.hostname
|
||||||
message.hostname
|
|
||||||
);
|
);
|
||||||
|
|
||||||
// Create message envelope
|
// Create message envelope for agent data
|
||||||
let envelope = MessageEnvelope::metrics(message.clone())
|
let envelope = MessageEnvelope::agent_data(data.clone())
|
||||||
.map_err(|e| anyhow::anyhow!("Failed to create message envelope: {}", e))?;
|
.map_err(|e| anyhow::anyhow!("Failed to create agent data envelope: {}", e))?;
|
||||||
|
|
||||||
// Serialize envelope
|
// Serialize envelope
|
||||||
let serialized = serde_json::to_vec(&envelope)?;
|
let serialized = serde_json::to_vec(&envelope)?;
|
||||||
@@ -61,11 +61,10 @@ impl ZmqHandler {
|
|||||||
// Send via ZMQ
|
// Send via ZMQ
|
||||||
self.publisher.send(&serialized, 0)?;
|
self.publisher.send(&serialized, 0)?;
|
||||||
|
|
||||||
debug!("Published metrics message ({} bytes)", serialized.len());
|
debug!("Published agent data message ({} bytes)", serialized.len());
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Try to receive a command (non-blocking)
|
/// Try to receive a command (non-blocking)
|
||||||
pub fn try_receive_command(&self) -> Result<Option<AgentCommand>> {
|
pub fn try_receive_command(&self) -> Result<Option<AgentCommand>> {
|
||||||
match self.command_receiver.recv_bytes(zmq::DONTWAIT) {
|
match self.command_receiver.recv_bytes(zmq::DONTWAIT) {
|
||||||
|
|||||||
@@ -1,2 +0,0 @@
|
|||||||
// This file is now empty - all configuration values come from config files
|
|
||||||
// No hardcoded defaults are used
|
|
||||||
@@ -6,8 +6,6 @@ use std::path::Path;
|
|||||||
pub mod loader;
|
pub mod loader;
|
||||||
pub mod validation;
|
pub mod validation;
|
||||||
|
|
||||||
use crate::status::HostStatusConfig;
|
|
||||||
|
|
||||||
/// Main agent configuration
|
/// Main agent configuration
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct AgentConfig {
|
pub struct AgentConfig {
|
||||||
@@ -15,7 +13,6 @@ pub struct AgentConfig {
|
|||||||
pub collectors: CollectorConfig,
|
pub collectors: CollectorConfig,
|
||||||
pub cache: CacheConfig,
|
pub cache: CacheConfig,
|
||||||
pub notifications: NotificationConfig,
|
pub notifications: NotificationConfig,
|
||||||
pub status_aggregation: HostStatusConfig,
|
|
||||||
pub collection_interval_seconds: u64,
|
pub collection_interval_seconds: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -74,7 +71,8 @@ pub struct DiskConfig {
|
|||||||
pub usage_warning_percent: f32,
|
pub usage_warning_percent: f32,
|
||||||
/// Disk usage critical threshold (percentage)
|
/// Disk usage critical threshold (percentage)
|
||||||
pub usage_critical_percent: f32,
|
pub usage_critical_percent: f32,
|
||||||
/// Filesystem configurations
|
/// Filesystem configurations (optional - auto-discovery used if empty)
|
||||||
|
#[serde(default)]
|
||||||
pub filesystems: Vec<FilesystemConfig>,
|
pub filesystems: Vec<FilesystemConfig>,
|
||||||
/// SMART monitoring thresholds
|
/// SMART monitoring thresholds
|
||||||
pub temperature_warning_celsius: f32,
|
pub temperature_warning_celsius: f32,
|
||||||
|
|||||||
@@ -7,10 +7,7 @@ mod agent;
|
|||||||
mod collectors;
|
mod collectors;
|
||||||
mod communication;
|
mod communication;
|
||||||
mod config;
|
mod config;
|
||||||
mod metrics;
|
|
||||||
mod notifications;
|
mod notifications;
|
||||||
mod service_tracker;
|
|
||||||
mod status;
|
|
||||||
|
|
||||||
use agent::Agent;
|
use agent::Agent;
|
||||||
|
|
||||||
|
|||||||
@@ -232,6 +232,8 @@ impl MetricCollectionManager {
|
|||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("Collector {} failed: {}", timed_collector.name, e);
|
error!("Collector {} failed: {}", timed_collector.name, e);
|
||||||
|
// Update last_collection time even on failure to prevent immediate retries
|
||||||
|
timed_collector.last_collection = Some(now);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,164 +0,0 @@
|
|||||||
use anyhow::Result;
|
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
use std::collections::HashSet;
|
|
||||||
use std::fs;
|
|
||||||
use std::path::Path;
|
|
||||||
use std::sync::{Arc, Mutex, OnceLock};
|
|
||||||
use tracing::{debug, info, warn};
|
|
||||||
|
|
||||||
/// Shared instance for global access
|
|
||||||
static GLOBAL_TRACKER: OnceLock<Arc<Mutex<UserStoppedServiceTracker>>> = OnceLock::new();
|
|
||||||
|
|
||||||
/// Remembers which services were stopped on purpose by a user.
///
/// Such services should be reported with OK status instead of Warning.
#[derive(Debug)]
pub struct UserStoppedServiceTracker {
    /// Names of the services currently flagged as user-stopped
    user_stopped_services: HashSet<String>,
    /// Path of the file the set is persisted to
    storage_path: String,
}
|
|
||||||
|
|
||||||
/// Serializable data structure for persistence
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
|
||||||
struct UserStoppedData {
|
|
||||||
services: Vec<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl UserStoppedServiceTracker {
|
|
||||||
/// Create new tracker with default storage path
|
|
||||||
pub fn new() -> Self {
|
|
||||||
Self::with_storage_path("/var/lib/cm-dashboard/user-stopped-services.json")
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Initialize global instance (called by agent)
|
|
||||||
pub fn init_global() -> Result<Self> {
|
|
||||||
let tracker = Self::new();
|
|
||||||
|
|
||||||
// Set global instance
|
|
||||||
let global_instance = Arc::new(Mutex::new(tracker));
|
|
||||||
if GLOBAL_TRACKER.set(global_instance).is_err() {
|
|
||||||
warn!("Global service tracker was already initialized");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Return a new instance for the agent to use
|
|
||||||
Ok(Self::new())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Check if a service is user-stopped (global access for collectors)
|
|
||||||
pub fn is_service_user_stopped(service_name: &str) -> bool {
|
|
||||||
if let Some(global) = GLOBAL_TRACKER.get() {
|
|
||||||
if let Ok(tracker) = global.lock() {
|
|
||||||
tracker.is_user_stopped(service_name)
|
|
||||||
} else {
|
|
||||||
debug!("Failed to lock global service tracker");
|
|
||||||
false
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
debug!("Global service tracker not initialized");
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Update global tracker (called by agent when tracker state changes)
|
|
||||||
pub fn update_global(updated_tracker: &UserStoppedServiceTracker) {
|
|
||||||
if let Some(global) = GLOBAL_TRACKER.get() {
|
|
||||||
if let Ok(mut tracker) = global.lock() {
|
|
||||||
tracker.user_stopped_services = updated_tracker.user_stopped_services.clone();
|
|
||||||
} else {
|
|
||||||
debug!("Failed to lock global service tracker for update");
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
debug!("Global service tracker not initialized for update");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create new tracker with custom storage path
|
|
||||||
pub fn with_storage_path<P: AsRef<Path>>(storage_path: P) -> Self {
|
|
||||||
let storage_path = storage_path.as_ref().to_string_lossy().to_string();
|
|
||||||
let mut tracker = Self {
|
|
||||||
user_stopped_services: HashSet::new(),
|
|
||||||
storage_path,
|
|
||||||
};
|
|
||||||
|
|
||||||
// Load existing data from storage
|
|
||||||
if let Err(e) = tracker.load_from_storage() {
|
|
||||||
warn!("Failed to load user-stopped services from storage: {}", e);
|
|
||||||
info!("Starting with empty user-stopped services list");
|
|
||||||
}
|
|
||||||
|
|
||||||
tracker
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Clear user-stopped flag for a service (when user starts it)
|
|
||||||
pub fn clear_user_stopped(&mut self, service_name: &str) -> Result<()> {
|
|
||||||
if self.user_stopped_services.remove(service_name) {
|
|
||||||
info!("Cleared user-stopped flag for service '{}'", service_name);
|
|
||||||
self.save_to_storage()?;
|
|
||||||
debug!("Service '{}' user-stopped flag cleared and saved to storage", service_name);
|
|
||||||
} else {
|
|
||||||
debug!("Service '{}' was not marked as user-stopped", service_name);
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Check if a service is marked as user-stopped
|
|
||||||
pub fn is_user_stopped(&self, service_name: &str) -> bool {
|
|
||||||
let is_stopped = self.user_stopped_services.contains(service_name);
|
|
||||||
debug!("Service '{}' user-stopped status: {}", service_name, is_stopped);
|
|
||||||
is_stopped
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Save current state to persistent storage
|
|
||||||
fn save_to_storage(&self) -> Result<()> {
|
|
||||||
// Create parent directory if it doesn't exist
|
|
||||||
if let Some(parent_dir) = Path::new(&self.storage_path).parent() {
|
|
||||||
if !parent_dir.exists() {
|
|
||||||
fs::create_dir_all(parent_dir)?;
|
|
||||||
debug!("Created parent directory: {}", parent_dir.display());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let data = UserStoppedData {
|
|
||||||
services: self.user_stopped_services.iter().cloned().collect(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let json_data = serde_json::to_string_pretty(&data)?;
|
|
||||||
fs::write(&self.storage_path, json_data)?;
|
|
||||||
|
|
||||||
debug!(
|
|
||||||
"Saved {} user-stopped services to {}",
|
|
||||||
data.services.len(),
|
|
||||||
self.storage_path
|
|
||||||
);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Load state from persistent storage
|
|
||||||
fn load_from_storage(&mut self) -> Result<()> {
|
|
||||||
if !Path::new(&self.storage_path).exists() {
|
|
||||||
debug!("Storage file {} does not exist, starting fresh", self.storage_path);
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
|
|
||||||
let json_data = fs::read_to_string(&self.storage_path)?;
|
|
||||||
let data: UserStoppedData = serde_json::from_str(&json_data)?;
|
|
||||||
|
|
||||||
self.user_stopped_services = data.services.into_iter().collect();
|
|
||||||
|
|
||||||
info!(
|
|
||||||
"Loaded {} user-stopped services from {}",
|
|
||||||
self.user_stopped_services.len(),
|
|
||||||
self.storage_path
|
|
||||||
);
|
|
||||||
|
|
||||||
if !self.user_stopped_services.is_empty() {
|
|
||||||
debug!("User-stopped services: {:?}", self.user_stopped_services);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "cm-dashboard"
|
name = "cm-dashboard"
|
||||||
version = "0.1.118"
|
version = "0.1.181"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
|||||||
@@ -183,30 +183,28 @@ impl Dashboard {
|
|||||||
|
|
||||||
// Check for new metrics
|
// Check for new metrics
|
||||||
if last_metrics_check.elapsed() >= metrics_check_interval {
|
if last_metrics_check.elapsed() >= metrics_check_interval {
|
||||||
if let Ok(Some(metric_message)) = self.zmq_consumer.receive_metrics().await {
|
if let Ok(Some(agent_data)) = self.zmq_consumer.receive_agent_data().await {
|
||||||
debug!(
|
debug!(
|
||||||
"Received metrics from {}: {} metrics",
|
"Received agent data from {}",
|
||||||
metric_message.hostname,
|
agent_data.hostname
|
||||||
metric_message.metrics.len()
|
|
||||||
);
|
);
|
||||||
|
|
||||||
// Track first contact with host (no command needed - agent sends data every 2s)
|
// Track first contact with host (no command needed - agent sends data every 2s)
|
||||||
let is_new_host = !self
|
let is_new_host = !self
|
||||||
.initial_commands_sent
|
.initial_commands_sent
|
||||||
.contains(&metric_message.hostname);
|
.contains(&agent_data.hostname);
|
||||||
|
|
||||||
if is_new_host {
|
if is_new_host {
|
||||||
info!(
|
info!(
|
||||||
"First contact with host {} - data will update automatically",
|
"First contact with host {} - data will update automatically",
|
||||||
metric_message.hostname
|
agent_data.hostname
|
||||||
);
|
);
|
||||||
self.initial_commands_sent
|
self.initial_commands_sent
|
||||||
.insert(metric_message.hostname.clone());
|
.insert(agent_data.hostname.clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update metric store
|
// Store structured data directly
|
||||||
self.metric_store
|
self.metric_store.store_agent_data(agent_data);
|
||||||
.update_metrics(&metric_message.hostname, metric_message.metrics);
|
|
||||||
|
|
||||||
// Check for agent version mismatches across hosts
|
// Check for agent version mismatches across hosts
|
||||||
if let Some((current_version, outdated_hosts)) = self.metric_store.get_version_mismatches() {
|
if let Some((current_version, outdated_hosts)) = self.metric_store.get_version_mismatches() {
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use cm_dashboard_shared::{CommandOutputMessage, MessageEnvelope, MessageType, MetricMessage};
|
use cm_dashboard_shared::{AgentData, CommandOutputMessage, MessageEnvelope, MessageType};
|
||||||
use tracing::{debug, error, info, warn};
|
use tracing::{debug, error, info, warn};
|
||||||
use zmq::{Context, Socket, SocketType};
|
use zmq::{Context, Socket, SocketType};
|
||||||
|
|
||||||
@@ -117,8 +117,8 @@ impl ZmqConsumer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Receive metrics from any connected agent (non-blocking)
|
/// Receive agent data (non-blocking)
|
||||||
pub async fn receive_metrics(&mut self) -> Result<Option<MetricMessage>> {
|
pub async fn receive_agent_data(&mut self) -> Result<Option<AgentData>> {
|
||||||
match self.subscriber.recv_bytes(zmq::DONTWAIT) {
|
match self.subscriber.recv_bytes(zmq::DONTWAIT) {
|
||||||
Ok(data) => {
|
Ok(data) => {
|
||||||
debug!("Received {} bytes from ZMQ", data.len());
|
debug!("Received {} bytes from ZMQ", data.len());
|
||||||
@@ -129,29 +129,27 @@ impl ZmqConsumer {
|
|||||||
|
|
||||||
// Check message type
|
// Check message type
|
||||||
match envelope.message_type {
|
match envelope.message_type {
|
||||||
MessageType::Metrics => {
|
MessageType::AgentData => {
|
||||||
let metrics = envelope
|
let agent_data = envelope
|
||||||
.decode_metrics()
|
.decode_agent_data()
|
||||||
.map_err(|e| anyhow::anyhow!("Failed to decode metrics: {}", e))?;
|
.map_err(|e| anyhow::anyhow!("Failed to decode agent data: {}", e))?;
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
"Received {} metrics from {}",
|
"Received agent data from host {}",
|
||||||
metrics.metrics.len(),
|
agent_data.hostname
|
||||||
metrics.hostname
|
|
||||||
);
|
);
|
||||||
|
Ok(Some(agent_data))
|
||||||
Ok(Some(metrics))
|
|
||||||
}
|
}
|
||||||
MessageType::Heartbeat => {
|
MessageType::Heartbeat => {
|
||||||
debug!("Received heartbeat");
|
debug!("Received heartbeat");
|
||||||
Ok(None) // Don't return heartbeats as metrics
|
Ok(None) // Don't return heartbeats
|
||||||
}
|
}
|
||||||
MessageType::CommandOutput => {
|
MessageType::CommandOutput => {
|
||||||
debug!("Received command output (will be handled by receive_command_output)");
|
debug!("Received command output (will be handled by receive_command_output)");
|
||||||
Ok(None) // Command output handled by separate method
|
Ok(None) // Command output handled by separate method
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
debug!("Received non-metrics message: {:?}", envelope.message_type);
|
debug!("Received unsupported message: {:?}", envelope.message_type);
|
||||||
Ok(None)
|
Ok(None)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -166,5 +164,6 @@ impl ZmqConsumer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use cm_dashboard_shared::Metric;
|
use cm_dashboard_shared::AgentData;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
use tracing::{debug, info, warn};
|
use tracing::{debug, info, warn};
|
||||||
@@ -7,8 +7,8 @@ use super::MetricDataPoint;
|
|||||||
|
|
||||||
/// Central metric storage for the dashboard
|
/// Central metric storage for the dashboard
|
||||||
pub struct MetricStore {
|
pub struct MetricStore {
|
||||||
/// Current metrics: hostname -> metric_name -> metric
|
/// Current structured data: hostname -> AgentData
|
||||||
current_metrics: HashMap<String, HashMap<String, Metric>>,
|
current_agent_data: HashMap<String, AgentData>,
|
||||||
/// Historical metrics for trending
|
/// Historical metrics for trending
|
||||||
historical_metrics: HashMap<String, Vec<MetricDataPoint>>,
|
historical_metrics: HashMap<String, Vec<MetricDataPoint>>,
|
||||||
/// Last heartbeat timestamp per host
|
/// Last heartbeat timestamp per host
|
||||||
@@ -21,7 +21,7 @@ pub struct MetricStore {
|
|||||||
impl MetricStore {
|
impl MetricStore {
|
||||||
pub fn new(max_metrics_per_host: usize, history_retention_hours: u64) -> Self {
|
pub fn new(max_metrics_per_host: usize, history_retention_hours: u64) -> Self {
|
||||||
Self {
|
Self {
|
||||||
current_metrics: HashMap::new(),
|
current_agent_data: HashMap::new(),
|
||||||
historical_metrics: HashMap::new(),
|
historical_metrics: HashMap::new(),
|
||||||
last_heartbeat: HashMap::new(),
|
last_heartbeat: HashMap::new(),
|
||||||
max_metrics_per_host,
|
max_metrics_per_host,
|
||||||
@@ -29,68 +29,43 @@ impl MetricStore {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Update metrics for a specific host
|
|
||||||
pub fn update_metrics(&mut self, hostname: &str, metrics: Vec<Metric>) {
|
/// Store structured agent data directly
|
||||||
|
pub fn store_agent_data(&mut self, agent_data: AgentData) {
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
|
let hostname = agent_data.hostname.clone();
|
||||||
|
|
||||||
debug!("Updating {} metrics for host {}", metrics.len(), hostname);
|
debug!("Storing structured data for host {}", hostname);
|
||||||
|
|
||||||
// Get or create host entry
|
// Store the structured data directly
|
||||||
let host_metrics = self
|
self.current_agent_data.insert(hostname.clone(), agent_data);
|
||||||
.current_metrics
|
|
||||||
.entry(hostname.to_string())
|
|
||||||
.or_insert_with(HashMap::new);
|
|
||||||
|
|
||||||
// Get or create historical entry
|
// Update heartbeat timestamp
|
||||||
|
self.last_heartbeat.insert(hostname.clone(), now);
|
||||||
|
debug!("Updated heartbeat for host {}", hostname);
|
||||||
|
|
||||||
|
// Add to history
|
||||||
let host_history = self
|
let host_history = self
|
||||||
.historical_metrics
|
.historical_metrics
|
||||||
.entry(hostname.to_string())
|
.entry(hostname.clone())
|
||||||
.or_insert_with(Vec::new);
|
.or_insert_with(Vec::new);
|
||||||
|
host_history.push(MetricDataPoint { received_at: now });
|
||||||
|
|
||||||
// Update current metrics and add to history
|
// Cleanup old data
|
||||||
for metric in metrics {
|
self.cleanup_host_data(&hostname);
|
||||||
let metric_name = metric.name.clone();
|
|
||||||
|
|
||||||
// Store current metric
|
info!("Stored structured data for {}", hostname);
|
||||||
host_metrics.insert(metric_name.clone(), metric.clone());
|
|
||||||
|
|
||||||
// Add to history
|
|
||||||
host_history.push(MetricDataPoint { received_at: now });
|
|
||||||
|
|
||||||
// Track heartbeat metrics for connectivity detection
|
|
||||||
if metric_name == "agent_heartbeat" {
|
|
||||||
self.last_heartbeat.insert(hostname.to_string(), now);
|
|
||||||
debug!("Updated heartbeat for host {}", hostname);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get metrics count before cleanup
|
|
||||||
let metrics_count = host_metrics.len();
|
|
||||||
|
|
||||||
// Cleanup old history and enforce limits
|
|
||||||
self.cleanup_host_data(hostname);
|
|
||||||
|
|
||||||
info!(
|
|
||||||
"Updated metrics for {}: {} current metrics",
|
|
||||||
hostname, metrics_count
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get current metric for a specific host
|
|
||||||
pub fn get_metric(&self, hostname: &str, metric_name: &str) -> Option<&Metric> {
|
|
||||||
self.current_metrics.get(hostname)?.get(metric_name)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Get all current metrics for a host as a vector
|
|
||||||
pub fn get_metrics_for_host(&self, hostname: &str) -> Vec<&Metric> {
|
|
||||||
if let Some(metrics_map) = self.current_metrics.get(hostname) {
|
/// Get current structured data for a host
|
||||||
metrics_map.values().collect()
|
pub fn get_agent_data(&self, hostname: &str) -> Option<&AgentData> {
|
||||||
} else {
|
self.current_agent_data.get(hostname)
|
||||||
Vec::new()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Get connected hosts (hosts with recent heartbeats)
|
/// Get connected hosts (hosts with recent heartbeats)
|
||||||
pub fn get_connected_hosts(&self, timeout: Duration) -> Vec<String> {
|
pub fn get_connected_hosts(&self, timeout: Duration) -> Vec<String> {
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
@@ -121,10 +96,10 @@ impl MetricStore {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clear metrics for offline hosts
|
// Clear data for offline hosts
|
||||||
for hostname in hosts_to_cleanup {
|
for hostname in hosts_to_cleanup {
|
||||||
if let Some(metrics) = self.current_metrics.remove(&hostname) {
|
if let Some(_agent_data) = self.current_agent_data.remove(&hostname) {
|
||||||
info!("Cleared {} metrics for offline host: {}", metrics.len(), hostname);
|
info!("Cleared structured data for offline host: {}", hostname);
|
||||||
}
|
}
|
||||||
// Keep heartbeat timestamp for reconnection detection
|
// Keep heartbeat timestamp for reconnection detection
|
||||||
// Don't remove from last_heartbeat to track when host was last seen
|
// Don't remove from last_heartbeat to track when host was last seen
|
||||||
@@ -156,12 +131,8 @@ impl MetricStore {
|
|||||||
pub fn get_agent_versions(&self) -> HashMap<String, String> {
|
pub fn get_agent_versions(&self) -> HashMap<String, String> {
|
||||||
let mut versions = HashMap::new();
|
let mut versions = HashMap::new();
|
||||||
|
|
||||||
for (hostname, metrics) in &self.current_metrics {
|
for (hostname, agent_data) in &self.current_agent_data {
|
||||||
if let Some(version_metric) = metrics.get("agent_version") {
|
versions.insert(hostname.clone(), agent_data.agent_version.clone());
|
||||||
if let cm_dashboard_shared::MetricValue::String(version) = &version_metric.value {
|
|
||||||
versions.insert(hostname.clone(), version.clone());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
versions
|
versions
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ use crate::config::DashboardConfig;
|
|||||||
use crate::metrics::MetricStore;
|
use crate::metrics::MetricStore;
|
||||||
use cm_dashboard_shared::Status;
|
use cm_dashboard_shared::Status;
|
||||||
use theme::{Components, Layout as ThemeLayout, Theme, Typography};
|
use theme::{Components, Layout as ThemeLayout, Theme, Typography};
|
||||||
use widgets::{BackupWidget, ServicesWidget, SystemWidget, Widget};
|
use widgets::{ServicesWidget, SystemWidget, Widget};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -32,8 +32,6 @@ pub struct HostWidgets {
|
|||||||
pub system_widget: SystemWidget,
|
pub system_widget: SystemWidget,
|
||||||
/// Services widget state
|
/// Services widget state
|
||||||
pub services_widget: ServicesWidget,
|
pub services_widget: ServicesWidget,
|
||||||
/// Backup widget state
|
|
||||||
pub backup_widget: BackupWidget,
|
|
||||||
/// Last update time for this host
|
/// Last update time for this host
|
||||||
pub last_update: Option<Instant>,
|
pub last_update: Option<Instant>,
|
||||||
}
|
}
|
||||||
@@ -43,7 +41,6 @@ impl HostWidgets {
|
|||||||
Self {
|
Self {
|
||||||
system_widget: SystemWidget::new(),
|
system_widget: SystemWidget::new(),
|
||||||
services_widget: ServicesWidget::new(),
|
services_widget: ServicesWidget::new(),
|
||||||
backup_widget: BackupWidget::new(),
|
|
||||||
last_update: None,
|
last_update: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -102,58 +99,16 @@ impl TuiApp {
|
|||||||
.or_insert_with(HostWidgets::new)
|
.or_insert_with(HostWidgets::new)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Update widgets with metrics from store (only for current host)
|
/// Update widgets with structured data from store (only for current host)
|
||||||
pub fn update_metrics(&mut self, metric_store: &MetricStore) {
|
pub fn update_metrics(&mut self, metric_store: &MetricStore) {
|
||||||
|
|
||||||
// Check for rebuild completion by agent hash change
|
|
||||||
|
|
||||||
if let Some(hostname) = self.current_host.clone() {
|
if let Some(hostname) = self.current_host.clone() {
|
||||||
// Only update widgets if we have metrics for this host
|
// Get structured data for this host
|
||||||
let all_metrics = metric_store.get_metrics_for_host(&hostname);
|
if let Some(agent_data) = metric_store.get_agent_data(&hostname) {
|
||||||
if !all_metrics.is_empty() {
|
|
||||||
// Single pass metric categorization for better performance
|
|
||||||
let mut cpu_metrics = Vec::new();
|
|
||||||
let mut memory_metrics = Vec::new();
|
|
||||||
let mut service_metrics = Vec::new();
|
|
||||||
let mut backup_metrics = Vec::new();
|
|
||||||
let mut nixos_metrics = Vec::new();
|
|
||||||
let mut disk_metrics = Vec::new();
|
|
||||||
|
|
||||||
for metric in all_metrics {
|
|
||||||
if metric.name.starts_with("cpu_")
|
|
||||||
|| metric.name.contains("c_state_")
|
|
||||||
|| metric.name.starts_with("process_top_") {
|
|
||||||
cpu_metrics.push(metric);
|
|
||||||
} else if metric.name.starts_with("memory_") || metric.name.starts_with("disk_tmp_") {
|
|
||||||
memory_metrics.push(metric);
|
|
||||||
} else if metric.name.starts_with("service_") {
|
|
||||||
service_metrics.push(metric);
|
|
||||||
} else if metric.name.starts_with("backup_") {
|
|
||||||
backup_metrics.push(metric);
|
|
||||||
} else if metric.name == "system_nixos_build" || metric.name == "system_active_users" || metric.name == "agent_version" {
|
|
||||||
nixos_metrics.push(metric);
|
|
||||||
} else if metric.name.starts_with("disk_") {
|
|
||||||
disk_metrics.push(metric);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Now get host widgets and update them
|
|
||||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||||
|
|
||||||
// Collect all system metrics (CPU, memory, NixOS, disk/storage)
|
// Update all widgets with structured data directly
|
||||||
let mut system_metrics = cpu_metrics;
|
host_widgets.system_widget.update_from_agent_data(agent_data);
|
||||||
system_metrics.extend(memory_metrics);
|
host_widgets.services_widget.update_from_agent_data(agent_data);
|
||||||
system_metrics.extend(nixos_metrics);
|
|
||||||
system_metrics.extend(disk_metrics);
|
|
||||||
|
|
||||||
host_widgets.system_widget.update_from_metrics(&system_metrics);
|
|
||||||
host_widgets
|
|
||||||
.services_widget
|
|
||||||
.update_from_metrics(&service_metrics);
|
|
||||||
host_widgets
|
|
||||||
.backup_widget
|
|
||||||
.update_from_metrics(&backup_metrics);
|
|
||||||
|
|
||||||
host_widgets.last_update = Some(Instant::now());
|
host_widgets.last_update = Some(Instant::now());
|
||||||
}
|
}
|
||||||
@@ -510,40 +465,17 @@ impl TuiApp {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if backup panel should be shown
|
// Left side: system panel only (full height)
|
||||||
let show_backup = if let Some(hostname) = self.current_host.clone() {
|
let left_chunks = ratatui::layout::Layout::default()
|
||||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
.direction(Direction::Vertical)
|
||||||
host_widgets.backup_widget.has_data()
|
.constraints([Constraint::Percentage(100)]) // System section takes full height
|
||||||
} else {
|
.split(content_chunks[0]);
|
||||||
false
|
|
||||||
};
|
|
||||||
|
|
||||||
// Left side: dynamic layout based on backup data availability
|
|
||||||
let left_chunks = if show_backup {
|
|
||||||
// Show both system and backup panels
|
|
||||||
ratatui::layout::Layout::default()
|
|
||||||
.direction(Direction::Vertical)
|
|
||||||
.constraints([
|
|
||||||
Constraint::Percentage(ThemeLayout::SYSTEM_PANEL_HEIGHT), // System section
|
|
||||||
Constraint::Percentage(ThemeLayout::BACKUP_PANEL_HEIGHT), // Backup section
|
|
||||||
])
|
|
||||||
.split(content_chunks[0])
|
|
||||||
} else {
|
|
||||||
// Show only system panel (full height)
|
|
||||||
ratatui::layout::Layout::default()
|
|
||||||
.direction(Direction::Vertical)
|
|
||||||
.constraints([Constraint::Percentage(100)]) // System section takes full height
|
|
||||||
.split(content_chunks[0])
|
|
||||||
};
|
|
||||||
|
|
||||||
// Render title bar
|
// Render title bar
|
||||||
self.render_btop_title(frame, main_chunks[0], metric_store);
|
self.render_btop_title(frame, main_chunks[0], metric_store);
|
||||||
|
|
||||||
// Render new panel layout
|
// Render system panel
|
||||||
self.render_system_panel(frame, left_chunks[0], metric_store);
|
self.render_system_panel(frame, left_chunks[0], metric_store);
|
||||||
if show_backup && left_chunks.len() > 1 {
|
|
||||||
self.render_backup_panel(frame, left_chunks[1]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Render services widget for current host
|
// Render services widget for current host
|
||||||
if let Some(hostname) = self.current_host.clone() {
|
if let Some(hostname) = self.current_host.clone() {
|
||||||
@@ -654,40 +586,14 @@ impl TuiApp {
|
|||||||
frame.render_widget(host_title, chunks[1]);
|
frame.render_widget(host_title, chunks[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate overall status for a host based on its metrics
|
/// Calculate overall status for a host based on its structured data
|
||||||
fn calculate_host_status(&self, hostname: &str, metric_store: &MetricStore) -> Status {
|
fn calculate_host_status(&self, hostname: &str, metric_store: &MetricStore) -> Status {
|
||||||
let metrics = metric_store.get_metrics_for_host(hostname);
|
// Check if we have structured data for this host
|
||||||
|
if let Some(_agent_data) = metric_store.get_agent_data(hostname) {
|
||||||
if metrics.is_empty() {
|
// Return OK since we have data
|
||||||
return Status::Offline;
|
|
||||||
}
|
|
||||||
|
|
||||||
// First check if we have the aggregated host status summary from the agent
|
|
||||||
if let Some(host_summary_metric) = metric_store.get_metric(hostname, "host_status_summary") {
|
|
||||||
return host_summary_metric.status;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Rewritten status aggregation - only Critical, Warning, or OK for top bar
|
|
||||||
let mut has_critical = false;
|
|
||||||
let mut has_warning = false;
|
|
||||||
|
|
||||||
for metric in &metrics {
|
|
||||||
match metric.status {
|
|
||||||
Status::Critical => has_critical = true,
|
|
||||||
Status::Warning => has_warning = true,
|
|
||||||
// Treat all other statuses as OK for top bar aggregation
|
|
||||||
Status::Ok | Status::Pending | Status::Inactive | Status::Unknown => {},
|
|
||||||
Status::Offline => {}, // Ignore offline
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Only return Critical, Warning, or OK - no other statuses
|
|
||||||
if has_critical {
|
|
||||||
Status::Critical
|
|
||||||
} else if has_warning {
|
|
||||||
Status::Warning
|
|
||||||
} else {
|
|
||||||
Status::Ok
|
Status::Ok
|
||||||
|
} else {
|
||||||
|
Status::Offline
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -736,17 +642,6 @@ impl TuiApp {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn render_backup_panel(&mut self, frame: &mut Frame, area: Rect) {
|
|
||||||
let backup_block = Components::widget_block("backup");
|
|
||||||
let inner_area = backup_block.inner(area);
|
|
||||||
frame.render_widget(backup_block, area);
|
|
||||||
|
|
||||||
// Get current host widgets for backup widget
|
|
||||||
if let Some(hostname) = self.current_host.clone() {
|
|
||||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
|
||||||
host_widgets.backup_widget.render(frame, inner_area);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Render offline host message with wake-up option
|
/// Render offline host message with wake-up option
|
||||||
fn render_offline_host_message(&self, frame: &mut Frame, area: Rect) {
|
fn render_offline_host_message(&self, frame: &mut Frame, area: Rect) {
|
||||||
|
|||||||
@@ -225,9 +225,6 @@ impl Layout {
|
|||||||
pub const LEFT_PANEL_WIDTH: u16 = 45;
|
pub const LEFT_PANEL_WIDTH: u16 = 45;
|
||||||
/// Right panel percentage (services)
|
/// Right panel percentage (services)
|
||||||
pub const RIGHT_PANEL_WIDTH: u16 = 55;
|
pub const RIGHT_PANEL_WIDTH: u16 = 55;
|
||||||
/// System vs backup split (equal)
|
|
||||||
pub const SYSTEM_PANEL_HEIGHT: u16 = 50;
|
|
||||||
pub const BACKUP_PANEL_HEIGHT: u16 = 50;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Typography system
|
/// Typography system
|
||||||
|
|||||||
@@ -1,435 +0,0 @@
|
|||||||
use cm_dashboard_shared::{Metric, Status};
|
|
||||||
use ratatui::{
|
|
||||||
layout::Rect,
|
|
||||||
widgets::Paragraph,
|
|
||||||
Frame,
|
|
||||||
};
|
|
||||||
use tracing::debug;
|
|
||||||
|
|
||||||
use super::Widget;
|
|
||||||
use crate::ui::theme::{StatusIcons, Typography};
|
|
||||||
|
|
||||||
/// Backup widget displaying backup status, services, and repository information
|
|
||||||
#[derive(Clone)]
|
|
||||||
pub struct BackupWidget {
|
|
||||||
/// Overall backup status
|
|
||||||
overall_status: Status,
|
|
||||||
/// Last backup duration in seconds
|
|
||||||
duration_seconds: Option<i64>,
|
|
||||||
/// Last backup timestamp
|
|
||||||
last_run_timestamp: Option<i64>,
|
|
||||||
/// Total number of backup services
|
|
||||||
total_services: Option<i64>,
|
|
||||||
/// Total repository size in GB
|
|
||||||
total_repo_size_gb: Option<f32>,
|
|
||||||
/// Total disk space for backups in GB
|
|
||||||
backup_disk_total_gb: Option<f32>,
|
|
||||||
/// Used disk space for backups in GB
|
|
||||||
backup_disk_used_gb: Option<f32>,
|
|
||||||
/// Backup disk product name from SMART data
|
|
||||||
backup_disk_product_name: Option<String>,
|
|
||||||
/// Backup disk serial number from SMART data
|
|
||||||
backup_disk_serial_number: Option<String>,
|
|
||||||
/// Backup disk wear percentage from SMART data
|
|
||||||
backup_disk_wear_percent: Option<f32>,
|
|
||||||
/// Backup disk filesystem label
|
|
||||||
backup_disk_filesystem_label: Option<String>,
|
|
||||||
/// Number of completed services
|
|
||||||
services_completed_count: Option<i64>,
|
|
||||||
/// Number of failed services
|
|
||||||
services_failed_count: Option<i64>,
|
|
||||||
/// Number of disabled services
|
|
||||||
services_disabled_count: Option<i64>,
|
|
||||||
/// All individual service metrics for detailed display
|
|
||||||
service_metrics: Vec<ServiceMetricData>,
|
|
||||||
/// Last update indicator
|
|
||||||
has_data: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
struct ServiceMetricData {
|
|
||||||
name: String,
|
|
||||||
status: Status,
|
|
||||||
exit_code: Option<i64>,
|
|
||||||
archive_count: Option<i64>,
|
|
||||||
repo_size_gb: Option<f32>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BackupWidget {
|
|
||||||
pub fn new() -> Self {
|
|
||||||
Self {
|
|
||||||
overall_status: Status::Unknown,
|
|
||||||
duration_seconds: None,
|
|
||||||
last_run_timestamp: None,
|
|
||||||
total_services: None,
|
|
||||||
total_repo_size_gb: None,
|
|
||||||
backup_disk_total_gb: None,
|
|
||||||
backup_disk_used_gb: None,
|
|
||||||
backup_disk_product_name: None,
|
|
||||||
backup_disk_serial_number: None,
|
|
||||||
backup_disk_wear_percent: None,
|
|
||||||
backup_disk_filesystem_label: None,
|
|
||||||
services_completed_count: None,
|
|
||||||
services_failed_count: None,
|
|
||||||
services_disabled_count: None,
|
|
||||||
service_metrics: Vec::new(),
|
|
||||||
has_data: false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Check if the backup widget has any data to display
|
|
||||||
pub fn has_data(&self) -> bool {
|
|
||||||
self.has_data
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// Format size with proper units (xxxkB/MB/GB/TB)
|
|
||||||
fn format_size_with_proper_units(size_gb: f32) -> String {
|
|
||||||
if size_gb >= 1000.0 {
|
|
||||||
// TB range
|
|
||||||
format!("{:.1}TB", size_gb / 1000.0)
|
|
||||||
} else if size_gb >= 1.0 {
|
|
||||||
// GB range
|
|
||||||
format!("{:.1}GB", size_gb)
|
|
||||||
} else if size_gb >= 0.001 {
|
|
||||||
// MB range (size_gb * 1024 = MB)
|
|
||||||
let size_mb = size_gb * 1024.0;
|
|
||||||
format!("{:.1}MB", size_mb)
|
|
||||||
} else if size_gb >= 0.000001 {
|
|
||||||
// kB range (size_gb * 1024 * 1024 = kB)
|
|
||||||
let size_kb = size_gb * 1024.0 * 1024.0;
|
|
||||||
format!("{:.0}kB", size_kb)
|
|
||||||
} else {
|
|
||||||
// B range (size_gb * 1024^3 = bytes)
|
|
||||||
let size_bytes = size_gb * 1024.0 * 1024.0 * 1024.0;
|
|
||||||
format!("{:.0}B", size_bytes)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// Extract service name from metric name (e.g., "backup_service_gitea_status" -> "gitea")
|
|
||||||
fn extract_service_name(metric_name: &str) -> Option<String> {
|
|
||||||
if metric_name.starts_with("backup_service_") {
|
|
||||||
let name_part = &metric_name[15..]; // Remove "backup_service_" prefix
|
|
||||||
|
|
||||||
// Try to extract service name by removing known suffixes
|
|
||||||
if let Some(service_name) = name_part.strip_suffix("_status") {
|
|
||||||
Some(service_name.to_string())
|
|
||||||
} else if let Some(service_name) = name_part.strip_suffix("_exit_code") {
|
|
||||||
Some(service_name.to_string())
|
|
||||||
} else if let Some(service_name) = name_part.strip_suffix("_archive_count") {
|
|
||||||
Some(service_name.to_string())
|
|
||||||
} else if let Some(service_name) = name_part.strip_suffix("_repo_size_gb") {
|
|
||||||
Some(service_name.to_string())
|
|
||||||
} else if let Some(service_name) = name_part.strip_suffix("_repo_path") {
|
|
||||||
Some(service_name.to_string())
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Widget for BackupWidget {
|
|
||||||
fn update_from_metrics(&mut self, metrics: &[&Metric]) {
|
|
||||||
debug!("Backup widget updating with {} metrics", metrics.len());
|
|
||||||
for metric in metrics {
|
|
||||||
debug!(
|
|
||||||
"Backup metric: {} = {:?} (status: {:?})",
|
|
||||||
metric.name, metric.value, metric.status
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Also debug the service_data after processing
|
|
||||||
debug!("Processing individual service metrics...");
|
|
||||||
|
|
||||||
// Log how many metrics are backup service metrics
|
|
||||||
let service_metric_count = metrics
|
|
||||||
.iter()
|
|
||||||
.filter(|m| m.name.starts_with("backup_service_"))
|
|
||||||
.count();
|
|
||||||
debug!(
|
|
||||||
"Found {} backup_service_ metrics out of {} total backup metrics",
|
|
||||||
service_metric_count,
|
|
||||||
metrics.len()
|
|
||||||
);
|
|
||||||
|
|
||||||
// Reset service metrics
|
|
||||||
self.service_metrics.clear();
|
|
||||||
let mut service_data: std::collections::HashMap<String, ServiceMetricData> =
|
|
||||||
std::collections::HashMap::new();
|
|
||||||
|
|
||||||
for metric in metrics {
|
|
||||||
match metric.name.as_str() {
|
|
||||||
"backup_overall_status" => {
|
|
||||||
let status_str = metric.value.as_string();
|
|
||||||
self.overall_status = match status_str.as_str() {
|
|
||||||
"ok" => Status::Ok,
|
|
||||||
"warning" => Status::Warning,
|
|
||||||
"critical" => Status::Critical,
|
|
||||||
_ => Status::Unknown,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
"backup_duration_seconds" => {
|
|
||||||
self.duration_seconds = metric.value.as_i64();
|
|
||||||
}
|
|
||||||
"backup_last_run_timestamp" => {
|
|
||||||
self.last_run_timestamp = metric.value.as_i64();
|
|
||||||
}
|
|
||||||
"backup_total_services" => {
|
|
||||||
self.total_services = metric.value.as_i64();
|
|
||||||
}
|
|
||||||
"backup_total_repo_size_gb" => {
|
|
||||||
self.total_repo_size_gb = metric.value.as_f32();
|
|
||||||
}
|
|
||||||
"backup_disk_total_gb" => {
|
|
||||||
self.backup_disk_total_gb = metric.value.as_f32();
|
|
||||||
}
|
|
||||||
"backup_disk_used_gb" => {
|
|
||||||
self.backup_disk_used_gb = metric.value.as_f32();
|
|
||||||
}
|
|
||||||
"backup_disk_product_name" => {
|
|
||||||
self.backup_disk_product_name = Some(metric.value.as_string());
|
|
||||||
}
|
|
||||||
"backup_disk_serial_number" => {
|
|
||||||
self.backup_disk_serial_number = Some(metric.value.as_string());
|
|
||||||
}
|
|
||||||
"backup_disk_wear_percent" => {
|
|
||||||
self.backup_disk_wear_percent = metric.value.as_f32();
|
|
||||||
}
|
|
||||||
"backup_disk_filesystem_label" => {
|
|
||||||
self.backup_disk_filesystem_label = Some(metric.value.as_string());
|
|
||||||
}
|
|
||||||
"backup_services_completed_count" => {
|
|
||||||
self.services_completed_count = metric.value.as_i64();
|
|
||||||
}
|
|
||||||
"backup_services_failed_count" => {
|
|
||||||
self.services_failed_count = metric.value.as_i64();
|
|
||||||
}
|
|
||||||
"backup_services_disabled_count" => {
|
|
||||||
self.services_disabled_count = metric.value.as_i64();
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
// Handle individual service metrics
|
|
||||||
if let Some(service_name) = Self::extract_service_name(&metric.name) {
|
|
||||||
debug!(
|
|
||||||
"Extracted service name '{}' from metric '{}'",
|
|
||||||
service_name, metric.name
|
|
||||||
);
|
|
||||||
let entry = service_data.entry(service_name.clone()).or_insert_with(|| {
|
|
||||||
ServiceMetricData {
|
|
||||||
name: service_name,
|
|
||||||
status: Status::Unknown,
|
|
||||||
exit_code: None,
|
|
||||||
archive_count: None,
|
|
||||||
repo_size_gb: None,
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
if metric.name.ends_with("_status") {
|
|
||||||
entry.status = metric.status;
|
|
||||||
debug!("Set status for {}: {:?}", entry.name, entry.status);
|
|
||||||
} else if metric.name.ends_with("_exit_code") {
|
|
||||||
entry.exit_code = metric.value.as_i64();
|
|
||||||
} else if metric.name.ends_with("_archive_count") {
|
|
||||||
entry.archive_count = metric.value.as_i64();
|
|
||||||
debug!(
|
|
||||||
"Set archive_count for {}: {:?}",
|
|
||||||
entry.name, entry.archive_count
|
|
||||||
);
|
|
||||||
} else if metric.name.ends_with("_repo_size_gb") {
|
|
||||||
entry.repo_size_gb = metric.value.as_f32();
|
|
||||||
debug!(
|
|
||||||
"Set repo_size_gb for {}: {:?}",
|
|
||||||
entry.name, entry.repo_size_gb
|
|
||||||
);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
debug!(
|
|
||||||
"Could not extract service name from metric: {}",
|
|
||||||
metric.name
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Convert service data to sorted vector
|
|
||||||
let mut services: Vec<ServiceMetricData> = service_data.into_values().collect();
|
|
||||||
services.sort_by(|a, b| a.name.cmp(&b.name));
|
|
||||||
self.service_metrics = services;
|
|
||||||
|
|
||||||
// Only show backup panel if we have meaningful backup data
|
|
||||||
self.has_data = !metrics.is_empty() && (
|
|
||||||
self.last_run_timestamp.is_some() ||
|
|
||||||
self.total_repo_size_gb.is_some() ||
|
|
||||||
!self.service_metrics.is_empty()
|
|
||||||
);
|
|
||||||
|
|
||||||
debug!(
|
|
||||||
"Backup widget updated: status={:?}, services={}, total_size={:?}GB",
|
|
||||||
self.overall_status,
|
|
||||||
self.service_metrics.len(),
|
|
||||||
self.total_repo_size_gb
|
|
||||||
);
|
|
||||||
|
|
||||||
// Debug individual service data
|
|
||||||
for service in &self.service_metrics {
|
|
||||||
debug!(
|
|
||||||
"Service {}: status={:?}, archives={:?}, size={:?}GB",
|
|
||||||
service.name, service.status, service.archive_count, service.repo_size_gb
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BackupWidget {
|
|
||||||
/// Render backup widget
|
|
||||||
pub fn render(&mut self, frame: &mut Frame, area: Rect) {
|
|
||||||
let mut lines = Vec::new();
|
|
||||||
|
|
||||||
// Latest backup section
|
|
||||||
lines.push(ratatui::text::Line::from(vec![
|
|
||||||
ratatui::text::Span::styled("Latest backup:", Typography::widget_title())
|
|
||||||
]));
|
|
||||||
|
|
||||||
// Timestamp with status icon
|
|
||||||
let timestamp_text = if let Some(timestamp) = self.last_run_timestamp {
|
|
||||||
self.format_timestamp(timestamp)
|
|
||||||
} else {
|
|
||||||
"Unknown".to_string()
|
|
||||||
};
|
|
||||||
let timestamp_spans = StatusIcons::create_status_spans(
|
|
||||||
self.overall_status,
|
|
||||||
×tamp_text
|
|
||||||
);
|
|
||||||
lines.push(ratatui::text::Line::from(timestamp_spans));
|
|
||||||
|
|
||||||
// Duration as sub-item
|
|
||||||
if let Some(duration) = self.duration_seconds {
|
|
||||||
let duration_text = self.format_duration(duration);
|
|
||||||
lines.push(ratatui::text::Line::from(vec![
|
|
||||||
ratatui::text::Span::styled(" └─ ", Typography::tree()),
|
|
||||||
ratatui::text::Span::styled(format!("Duration: {}", duration_text), Typography::secondary())
|
|
||||||
]));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Disk section
|
|
||||||
lines.push(ratatui::text::Line::from(vec![
|
|
||||||
ratatui::text::Span::styled("Disk:", Typography::widget_title())
|
|
||||||
]));
|
|
||||||
|
|
||||||
// Disk product name with status
|
|
||||||
if let Some(product) = &self.backup_disk_product_name {
|
|
||||||
let disk_spans = StatusIcons::create_status_spans(
|
|
||||||
Status::Ok, // Assuming disk is OK if we have data
|
|
||||||
product
|
|
||||||
);
|
|
||||||
lines.push(ratatui::text::Line::from(disk_spans));
|
|
||||||
|
|
||||||
// Collect sub-items to determine tree structure
|
|
||||||
let mut sub_items = Vec::new();
|
|
||||||
|
|
||||||
if let Some(serial) = &self.backup_disk_serial_number {
|
|
||||||
sub_items.push(format!("S/N: {}", serial));
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(wear) = self.backup_disk_wear_percent {
|
|
||||||
sub_items.push(format!("Wear: {:.0}%", wear));
|
|
||||||
}
|
|
||||||
|
|
||||||
if let (Some(used), Some(total)) = (self.backup_disk_used_gb, self.backup_disk_total_gb) {
|
|
||||||
let used_str = Self::format_size_with_proper_units(used);
|
|
||||||
let total_str = Self::format_size_with_proper_units(total);
|
|
||||||
sub_items.push(format!("Usage: {}/{}", used_str, total_str));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Render sub-items with proper tree structure
|
|
||||||
let num_items = sub_items.len();
|
|
||||||
for (i, item) in sub_items.into_iter().enumerate() {
|
|
||||||
let is_last = i == num_items - 1;
|
|
||||||
let tree_char = if is_last { " └─ " } else { " ├─ " };
|
|
||||||
lines.push(ratatui::text::Line::from(vec![
|
|
||||||
ratatui::text::Span::styled(tree_char, Typography::tree()),
|
|
||||||
ratatui::text::Span::styled(item, Typography::secondary())
|
|
||||||
]));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Repos section
|
|
||||||
lines.push(ratatui::text::Line::from(vec![
|
|
||||||
ratatui::text::Span::styled("Repos:", Typography::widget_title())
|
|
||||||
]));
|
|
||||||
|
|
||||||
// Add all repository lines (no truncation here - scroll will handle display)
|
|
||||||
for service in &self.service_metrics {
|
|
||||||
if let (Some(archives), Some(size_gb)) = (service.archive_count, service.repo_size_gb) {
|
|
||||||
let size_str = Self::format_size_with_proper_units(size_gb);
|
|
||||||
let repo_text = format!("{} ({}) {}", service.name, archives, size_str);
|
|
||||||
let repo_spans = StatusIcons::create_status_spans(service.status, &repo_text);
|
|
||||||
lines.push(ratatui::text::Line::from(repo_spans));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Apply scroll offset
|
|
||||||
let total_lines = lines.len();
|
|
||||||
let available_height = area.height as usize;
|
|
||||||
|
|
||||||
// Show only what fits, with "X more below" if needed
|
|
||||||
if total_lines > available_height {
|
|
||||||
let lines_for_content = available_height.saturating_sub(1); // Reserve one line for "more below"
|
|
||||||
let mut visible_lines: Vec<_> = lines
|
|
||||||
.into_iter()
|
|
||||||
.take(lines_for_content)
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let hidden_below = total_lines.saturating_sub(lines_for_content);
|
|
||||||
if hidden_below > 0 {
|
|
||||||
let more_line = ratatui::text::Line::from(vec![
|
|
||||||
ratatui::text::Span::styled(format!("... {} more below", hidden_below), Typography::muted())
|
|
||||||
]);
|
|
||||||
visible_lines.push(more_line);
|
|
||||||
}
|
|
||||||
|
|
||||||
let paragraph = Paragraph::new(ratatui::text::Text::from(visible_lines));
|
|
||||||
frame.render_widget(paragraph, area);
|
|
||||||
} else {
|
|
||||||
let paragraph = Paragraph::new(ratatui::text::Text::from(lines));
|
|
||||||
frame.render_widget(paragraph, area);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BackupWidget {
|
|
||||||
/// Format timestamp for display
|
|
||||||
fn format_timestamp(&self, timestamp: i64) -> String {
|
|
||||||
let datetime = chrono::DateTime::from_timestamp(timestamp, 0)
|
|
||||||
.unwrap_or_else(|| chrono::Utc::now());
|
|
||||||
datetime.format("%Y-%m-%d %H:%M:%S").to_string()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Format duration in seconds to human readable format
|
|
||||||
fn format_duration(&self, duration_seconds: i64) -> String {
|
|
||||||
let minutes = duration_seconds / 60;
|
|
||||||
let seconds = duration_seconds % 60;
|
|
||||||
|
|
||||||
if minutes > 0 {
|
|
||||||
format!("{}.{}m", minutes, seconds / 6) // Show 1 decimal for minutes
|
|
||||||
} else {
|
|
||||||
format!("{}s", seconds)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for BackupWidget {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self::new()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
// This file is intentionally left minimal - CPU functionality is handled by the SystemWidget
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
// This file is intentionally left minimal - Memory functionality is handled by the SystemWidget
|
|
||||||
@@ -1,18 +1,13 @@
|
|||||||
use cm_dashboard_shared::Metric;
|
use cm_dashboard_shared::AgentData;
|
||||||
|
|
||||||
pub mod backup;
|
|
||||||
pub mod cpu;
|
|
||||||
pub mod memory;
|
|
||||||
pub mod services;
|
pub mod services;
|
||||||
pub mod system;
|
pub mod system;
|
||||||
|
|
||||||
pub use backup::BackupWidget;
|
|
||||||
pub use services::ServicesWidget;
|
pub use services::ServicesWidget;
|
||||||
pub use system::SystemWidget;
|
pub use system::SystemWidget;
|
||||||
|
|
||||||
/// Widget trait for UI components that display metrics
|
/// Widget trait for UI components that display structured data
|
||||||
pub trait Widget {
|
pub trait Widget {
|
||||||
/// Update widget with new metrics data
|
/// Update widget with structured agent data
|
||||||
fn update_from_metrics(&mut self, metrics: &[&Metric]);
|
fn update_from_agent_data(&mut self, agent_data: &AgentData);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use cm_dashboard_shared::{Metric, Status};
|
use cm_dashboard_shared::{Metric, Status};
|
||||||
|
use super::Widget;
|
||||||
use ratatui::{
|
use ratatui::{
|
||||||
layout::{Constraint, Direction, Layout, Rect},
|
layout::{Constraint, Direction, Layout, Rect},
|
||||||
widgets::Paragraph,
|
widgets::Paragraph,
|
||||||
@@ -7,7 +8,6 @@ use ratatui::{
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use super::Widget;
|
|
||||||
use crate::ui::theme::{Components, StatusIcons, Theme, Typography};
|
use crate::ui::theme::{Components, StatusIcons, Theme, Typography};
|
||||||
use ratatui::style::Style;
|
use ratatui::style::Style;
|
||||||
|
|
||||||
@@ -28,10 +28,9 @@ pub struct ServicesWidget {
|
|||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
struct ServiceInfo {
|
struct ServiceInfo {
|
||||||
status: String,
|
|
||||||
memory_mb: Option<f32>,
|
memory_mb: Option<f32>,
|
||||||
disk_gb: Option<f32>,
|
disk_gb: Option<f32>,
|
||||||
latency_ms: Option<f32>,
|
metrics: Vec<(String, f32, Option<String>)>, // (label, value, unit)
|
||||||
widget_status: Status,
|
widget_status: Status,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -47,6 +46,7 @@ impl ServicesWidget {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Extract service name and determine if it's a parent or sub-service
|
/// Extract service name and determine if it's a parent or sub-service
|
||||||
|
#[allow(dead_code)]
|
||||||
fn extract_service_info(metric_name: &str) -> Option<(String, Option<String>)> {
|
fn extract_service_info(metric_name: &str) -> Option<(String, Option<String>)> {
|
||||||
if metric_name.starts_with("service_") {
|
if metric_name.starts_with("service_") {
|
||||||
if let Some(end_pos) = metric_name
|
if let Some(end_pos) = metric_name
|
||||||
@@ -112,10 +112,15 @@ impl ServicesWidget {
|
|||||||
name.to_string()
|
name.to_string()
|
||||||
};
|
};
|
||||||
|
|
||||||
// Parent services always show actual systemctl status
|
// Convert Status enum to display text
|
||||||
let status_str = match info.widget_status {
|
let status_str = match info.widget_status {
|
||||||
Status::Pending => "pending".to_string(),
|
Status::Ok => "active",
|
||||||
_ => info.status.clone(), // Use actual status from agent (active/inactive/failed)
|
Status::Inactive => "inactive",
|
||||||
|
Status::Critical => "failed",
|
||||||
|
Status::Pending => "pending",
|
||||||
|
Status::Warning => "warning",
|
||||||
|
Status::Unknown => "unknown",
|
||||||
|
Status::Offline => "offline",
|
||||||
};
|
};
|
||||||
|
|
||||||
format!(
|
format!(
|
||||||
@@ -152,15 +157,25 @@ impl ServicesWidget {
|
|||||||
Status::Offline => Theme::muted_text(),
|
Status::Offline => Theme::muted_text(),
|
||||||
};
|
};
|
||||||
|
|
||||||
// For sub-services, prefer latency if available
|
// Display metrics or status for sub-services
|
||||||
let status_str = if let Some(latency) = info.latency_ms {
|
let status_str = if !info.metrics.is_empty() {
|
||||||
if latency < 0.0 {
|
// Show first metric with label and unit
|
||||||
"timeout".to_string()
|
let (label, value, unit) = &info.metrics[0];
|
||||||
} else {
|
match unit {
|
||||||
format!("{:.0}ms", latency)
|
Some(u) => format!("{}: {:.1} {}", label, value, u),
|
||||||
|
None => format!("{}: {:.1}", label, value),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
info.status.clone()
|
// Convert Status enum to display text for sub-services
|
||||||
|
match info.widget_status {
|
||||||
|
Status::Ok => "active",
|
||||||
|
Status::Inactive => "inactive",
|
||||||
|
Status::Critical => "failed",
|
||||||
|
Status::Pending => "pending",
|
||||||
|
Status::Warning => "warning",
|
||||||
|
Status::Unknown => "unknown",
|
||||||
|
Status::Offline => "offline",
|
||||||
|
}.to_string()
|
||||||
};
|
};
|
||||||
let tree_symbol = if is_last { "└─" } else { "├─" };
|
let tree_symbol = if is_last { "└─" } else { "├─" };
|
||||||
|
|
||||||
@@ -255,6 +270,59 @@ impl ServicesWidget {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Widget for ServicesWidget {
|
impl Widget for ServicesWidget {
|
||||||
|
fn update_from_agent_data(&mut self, agent_data: &cm_dashboard_shared::AgentData) {
|
||||||
|
self.has_data = true;
|
||||||
|
self.parent_services.clear();
|
||||||
|
self.sub_services.clear();
|
||||||
|
|
||||||
|
for service in &agent_data.services {
|
||||||
|
// Store parent service
|
||||||
|
let parent_info = ServiceInfo {
|
||||||
|
memory_mb: Some(service.memory_mb),
|
||||||
|
disk_gb: Some(service.disk_gb),
|
||||||
|
metrics: Vec::new(), // Parent services don't have custom metrics
|
||||||
|
widget_status: service.service_status,
|
||||||
|
};
|
||||||
|
self.parent_services.insert(service.name.clone(), parent_info);
|
||||||
|
|
||||||
|
// Process sub-services if any
|
||||||
|
if !service.sub_services.is_empty() {
|
||||||
|
let mut sub_list = Vec::new();
|
||||||
|
for sub_service in &service.sub_services {
|
||||||
|
// Convert metrics to display format
|
||||||
|
let metrics: Vec<(String, f32, Option<String>)> = sub_service.metrics.iter()
|
||||||
|
.map(|m| (m.label.clone(), m.value, m.unit.clone()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let sub_info = ServiceInfo {
|
||||||
|
memory_mb: None, // Not used for sub-services
|
||||||
|
disk_gb: None, // Not used for sub-services
|
||||||
|
metrics,
|
||||||
|
widget_status: sub_service.service_status,
|
||||||
|
};
|
||||||
|
sub_list.push((sub_service.name.clone(), sub_info));
|
||||||
|
}
|
||||||
|
self.sub_services.insert(service.name.clone(), sub_list);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Aggregate status from all services
|
||||||
|
let mut all_statuses = Vec::new();
|
||||||
|
all_statuses.extend(self.parent_services.values().map(|info| info.widget_status));
|
||||||
|
for sub_list in self.sub_services.values() {
|
||||||
|
all_statuses.extend(sub_list.iter().map(|(_, info)| info.widget_status));
|
||||||
|
}
|
||||||
|
|
||||||
|
self.status = if all_statuses.is_empty() {
|
||||||
|
Status::Unknown
|
||||||
|
} else {
|
||||||
|
Status::aggregate(&all_statuses)
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ServicesWidget {
|
||||||
|
#[allow(dead_code)]
|
||||||
fn update_from_metrics(&mut self, metrics: &[&Metric]) {
|
fn update_from_metrics(&mut self, metrics: &[&Metric]) {
|
||||||
debug!("Services widget updating with {} metrics", metrics.len());
|
debug!("Services widget updating with {} metrics", metrics.len());
|
||||||
|
|
||||||
@@ -270,15 +338,13 @@ impl Widget for ServicesWidget {
|
|||||||
self.parent_services
|
self.parent_services
|
||||||
.entry(parent_service)
|
.entry(parent_service)
|
||||||
.or_insert(ServiceInfo {
|
.or_insert(ServiceInfo {
|
||||||
status: "unknown".to_string(),
|
|
||||||
memory_mb: None,
|
memory_mb: None,
|
||||||
disk_gb: None,
|
disk_gb: None,
|
||||||
latency_ms: None,
|
metrics: Vec::new(),
|
||||||
widget_status: Status::Unknown,
|
widget_status: Status::Unknown,
|
||||||
});
|
});
|
||||||
|
|
||||||
if metric.name.ends_with("_status") {
|
if metric.name.ends_with("_status") {
|
||||||
service_info.status = metric.value.as_string();
|
|
||||||
service_info.widget_status = metric.status;
|
service_info.widget_status = metric.status;
|
||||||
} else if metric.name.ends_with("_memory_mb") {
|
} else if metric.name.ends_with("_memory_mb") {
|
||||||
if let Some(memory) = metric.value.as_f32() {
|
if let Some(memory) = metric.value.as_f32() {
|
||||||
@@ -307,10 +373,9 @@ impl Widget for ServicesWidget {
|
|||||||
sub_service_list.push((
|
sub_service_list.push((
|
||||||
sub_name.clone(),
|
sub_name.clone(),
|
||||||
ServiceInfo {
|
ServiceInfo {
|
||||||
status: "unknown".to_string(),
|
|
||||||
memory_mb: None,
|
memory_mb: None,
|
||||||
disk_gb: None,
|
disk_gb: None,
|
||||||
latency_ms: None,
|
metrics: Vec::new(),
|
||||||
widget_status: Status::Unknown,
|
widget_status: Status::Unknown,
|
||||||
},
|
},
|
||||||
));
|
));
|
||||||
@@ -318,7 +383,6 @@ impl Widget for ServicesWidget {
|
|||||||
};
|
};
|
||||||
|
|
||||||
if metric.name.ends_with("_status") {
|
if metric.name.ends_with("_status") {
|
||||||
sub_service_info.status = metric.value.as_string();
|
|
||||||
sub_service_info.widget_status = metric.status;
|
sub_service_info.widget_status = metric.status;
|
||||||
} else if metric.name.ends_with("_memory_mb") {
|
} else if metric.name.ends_with("_memory_mb") {
|
||||||
if let Some(memory) = metric.value.as_f32() {
|
if let Some(memory) = metric.value.as_f32() {
|
||||||
@@ -328,11 +392,6 @@ impl Widget for ServicesWidget {
|
|||||||
if let Some(disk) = metric.value.as_f32() {
|
if let Some(disk) = metric.value.as_f32() {
|
||||||
sub_service_info.disk_gb = Some(disk);
|
sub_service_info.disk_gb = Some(disk);
|
||||||
}
|
}
|
||||||
} else if metric.name.ends_with("_latency_ms") {
|
|
||||||
if let Some(latency) = metric.value.as_f32() {
|
|
||||||
sub_service_info.latency_ms = Some(latency);
|
|
||||||
sub_service_info.widget_status = metric.status;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use cm_dashboard_shared::{Metric, MetricValue, Status};
|
use cm_dashboard_shared::Status;
|
||||||
use ratatui::{
|
use ratatui::{
|
||||||
layout::Rect,
|
layout::Rect,
|
||||||
text::{Line, Span, Text},
|
text::{Line, Span, Text},
|
||||||
@@ -6,17 +6,18 @@ use ratatui::{
|
|||||||
Frame,
|
Frame,
|
||||||
};
|
};
|
||||||
|
|
||||||
use super::Widget;
|
|
||||||
use crate::ui::theme::{StatusIcons, Typography};
|
use crate::ui::theme::{StatusIcons, Typography};
|
||||||
|
|
||||||
/// System widget displaying NixOS info, CPU, RAM, and Storage in unified layout
|
/// System widget displaying NixOS info, Network, CPU, RAM, and Storage in unified layout
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct SystemWidget {
|
pub struct SystemWidget {
|
||||||
// NixOS information
|
// NixOS information
|
||||||
nixos_build: Option<String>,
|
nixos_build: Option<String>,
|
||||||
config_hash: Option<String>,
|
|
||||||
agent_hash: Option<String>,
|
agent_hash: Option<String>,
|
||||||
|
|
||||||
|
// Network interfaces
|
||||||
|
network_interfaces: Vec<cm_dashboard_shared::NetworkInterfaceData>,
|
||||||
|
|
||||||
// CPU metrics
|
// CPU metrics
|
||||||
cpu_load_1min: Option<f32>,
|
cpu_load_1min: Option<f32>,
|
||||||
cpu_load_5min: Option<f32>,
|
cpu_load_5min: Option<f32>,
|
||||||
@@ -33,10 +34,23 @@ pub struct SystemWidget {
|
|||||||
tmp_total_gb: Option<f32>,
|
tmp_total_gb: Option<f32>,
|
||||||
memory_status: Status,
|
memory_status: Status,
|
||||||
tmp_status: Status,
|
tmp_status: Status,
|
||||||
|
/// All tmpfs mounts (for auto-discovery support)
|
||||||
|
tmpfs_mounts: Vec<cm_dashboard_shared::TmpfsData>,
|
||||||
|
|
||||||
// Storage metrics (collected from disk metrics)
|
// Storage metrics (collected from disk metrics)
|
||||||
storage_pools: Vec<StoragePool>,
|
storage_pools: Vec<StoragePool>,
|
||||||
|
|
||||||
|
// Backup metrics
|
||||||
|
backup_status: String,
|
||||||
|
backup_start_time_raw: Option<String>,
|
||||||
|
backup_disk_serial: Option<String>,
|
||||||
|
backup_disk_usage_percent: Option<f32>,
|
||||||
|
backup_disk_used_gb: Option<f32>,
|
||||||
|
backup_disk_total_gb: Option<f32>,
|
||||||
|
backup_disk_wear_percent: Option<f32>,
|
||||||
|
backup_disk_temperature: Option<f32>,
|
||||||
|
backup_last_size_gb: Option<f32>,
|
||||||
|
|
||||||
// Overall status
|
// Overall status
|
||||||
has_data: bool,
|
has_data: bool,
|
||||||
}
|
}
|
||||||
@@ -46,14 +60,14 @@ struct StoragePool {
|
|||||||
name: String,
|
name: String,
|
||||||
mount_point: String,
|
mount_point: String,
|
||||||
pool_type: String, // "single", "mergerfs (2+1)", "RAID5 (3+1)", etc.
|
pool_type: String, // "single", "mergerfs (2+1)", "RAID5 (3+1)", etc.
|
||||||
pool_health: Option<String>, // "healthy", "degraded", "critical", "rebuilding"
|
drives: Vec<StorageDrive>, // For physical drives
|
||||||
drives: Vec<StorageDrive>,
|
data_drives: Vec<StorageDrive>, // For MergerFS pools
|
||||||
|
parity_drives: Vec<StorageDrive>, // For MergerFS pools
|
||||||
filesystems: Vec<FileSystem>, // For physical drive pools: individual filesystem children
|
filesystems: Vec<FileSystem>, // For physical drive pools: individual filesystem children
|
||||||
usage_percent: Option<f32>,
|
usage_percent: Option<f32>,
|
||||||
used_gb: Option<f32>,
|
used_gb: Option<f32>,
|
||||||
total_gb: Option<f32>,
|
total_gb: Option<f32>,
|
||||||
status: Status,
|
status: Status,
|
||||||
health_status: Status, // Separate status for pool health vs usage
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
@@ -70,7 +84,6 @@ struct FileSystem {
|
|||||||
usage_percent: Option<f32>,
|
usage_percent: Option<f32>,
|
||||||
used_gb: Option<f32>,
|
used_gb: Option<f32>,
|
||||||
total_gb: Option<f32>,
|
total_gb: Option<f32>,
|
||||||
available_gb: Option<f32>,
|
|
||||||
status: Status,
|
status: Status,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -78,8 +91,8 @@ impl SystemWidget {
|
|||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self {
|
Self {
|
||||||
nixos_build: None,
|
nixos_build: None,
|
||||||
config_hash: None,
|
|
||||||
agent_hash: None,
|
agent_hash: None,
|
||||||
|
network_interfaces: Vec::new(),
|
||||||
cpu_load_1min: None,
|
cpu_load_1min: None,
|
||||||
cpu_load_5min: None,
|
cpu_load_5min: None,
|
||||||
cpu_load_15min: None,
|
cpu_load_15min: None,
|
||||||
@@ -93,7 +106,17 @@ impl SystemWidget {
|
|||||||
tmp_total_gb: None,
|
tmp_total_gb: None,
|
||||||
memory_status: Status::Unknown,
|
memory_status: Status::Unknown,
|
||||||
tmp_status: Status::Unknown,
|
tmp_status: Status::Unknown,
|
||||||
|
tmpfs_mounts: Vec::new(),
|
||||||
storage_pools: Vec::new(),
|
storage_pools: Vec::new(),
|
||||||
|
backup_status: "unknown".to_string(),
|
||||||
|
backup_start_time_raw: None,
|
||||||
|
backup_disk_serial: None,
|
||||||
|
backup_disk_usage_percent: None,
|
||||||
|
backup_disk_used_gb: None,
|
||||||
|
backup_disk_total_gb: None,
|
||||||
|
backup_disk_wear_percent: None,
|
||||||
|
backup_disk_temperature: None,
|
||||||
|
backup_last_size_gb: None,
|
||||||
has_data: false,
|
has_data: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -126,602 +149,636 @@ impl SystemWidget {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Format /tmp usage
|
|
||||||
fn format_tmp_usage(&self) -> String {
|
|
||||||
match (self.tmp_usage_percent, self.tmp_used_gb, self.tmp_total_gb) {
|
|
||||||
(Some(pct), Some(used), Some(total)) => {
|
|
||||||
let used_str = if used < 0.1 {
|
|
||||||
format!("{:.0}B", used * 1024.0) // Show as MB if very small
|
|
||||||
} else {
|
|
||||||
format!("{:.1}GB", used)
|
|
||||||
};
|
|
||||||
format!("{:.0}% {}/{:.1}GB", pct, used_str, total)
|
|
||||||
}
|
|
||||||
_ => "—% —GB/—GB".to_string(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get the current agent hash for rebuild completion detection
|
/// Get the current agent hash for rebuild completion detection
|
||||||
pub fn _get_agent_hash(&self) -> Option<&String> {
|
pub fn _get_agent_hash(&self) -> Option<&String> {
|
||||||
self.agent_hash.as_ref()
|
self.agent_hash.as_ref()
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Get mount point for a pool name
|
use super::Widget;
|
||||||
fn get_mount_point_for_pool(&self, pool_name: &str) -> String {
|
|
||||||
match pool_name {
|
impl Widget for SystemWidget {
|
||||||
"root" => "/".to_string(),
|
fn update_from_agent_data(&mut self, agent_data: &cm_dashboard_shared::AgentData) {
|
||||||
"steampool" => "/mnt/steampool".to_string(),
|
self.has_data = true;
|
||||||
"steampool_1" => "/steampool_1".to_string(),
|
|
||||||
"steampool_2" => "/steampool_2".to_string(),
|
// Extract agent version
|
||||||
_ => format!("/{}", pool_name), // Default fallback
|
self.agent_hash = Some(agent_data.agent_version.clone());
|
||||||
|
|
||||||
|
// Extract build version
|
||||||
|
self.nixos_build = agent_data.build_version.clone();
|
||||||
|
|
||||||
|
// Extract network interfaces
|
||||||
|
self.network_interfaces = agent_data.system.network.interfaces.clone();
|
||||||
|
|
||||||
|
// Extract CPU data directly
|
||||||
|
let cpu = &agent_data.system.cpu;
|
||||||
|
self.cpu_load_1min = Some(cpu.load_1min);
|
||||||
|
self.cpu_load_5min = Some(cpu.load_5min);
|
||||||
|
self.cpu_load_15min = Some(cpu.load_15min);
|
||||||
|
self.cpu_frequency = Some(cpu.frequency_mhz);
|
||||||
|
self.cpu_status = Status::Ok;
|
||||||
|
|
||||||
|
// Extract memory data directly
|
||||||
|
let memory = &agent_data.system.memory;
|
||||||
|
self.memory_usage_percent = Some(memory.usage_percent);
|
||||||
|
self.memory_used_gb = Some(memory.used_gb);
|
||||||
|
self.memory_total_gb = Some(memory.total_gb);
|
||||||
|
self.memory_status = Status::Ok;
|
||||||
|
|
||||||
|
// Store all tmpfs mounts for display
|
||||||
|
self.tmpfs_mounts = memory.tmpfs.clone();
|
||||||
|
|
||||||
|
// Extract tmpfs data (maintain backward compatibility for /tmp)
|
||||||
|
if let Some(tmp_data) = memory.tmpfs.iter().find(|t| t.mount == "/tmp") {
|
||||||
|
self.tmp_usage_percent = Some(tmp_data.usage_percent);
|
||||||
|
self.tmp_used_gb = Some(tmp_data.used_gb);
|
||||||
|
self.tmp_total_gb = Some(tmp_data.total_gb);
|
||||||
|
self.tmp_status = Status::Ok;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert storage data to internal format
|
||||||
|
self.update_storage_from_agent_data(agent_data);
|
||||||
|
|
||||||
|
// Extract backup data
|
||||||
|
let backup = &agent_data.backup;
|
||||||
|
self.backup_status = backup.status.clone();
|
||||||
|
self.backup_start_time_raw = backup.start_time_raw.clone();
|
||||||
|
self.backup_last_size_gb = backup.last_backup_size_gb;
|
||||||
|
|
||||||
|
if let Some(disk) = &backup.repository_disk {
|
||||||
|
self.backup_disk_serial = Some(disk.serial.clone());
|
||||||
|
self.backup_disk_usage_percent = Some(disk.usage_percent);
|
||||||
|
self.backup_disk_used_gb = Some(disk.used_gb);
|
||||||
|
self.backup_disk_total_gb = Some(disk.total_gb);
|
||||||
|
self.backup_disk_wear_percent = disk.wear_percent;
|
||||||
|
self.backup_disk_temperature = disk.temperature_celsius;
|
||||||
|
} else {
|
||||||
|
self.backup_disk_serial = None;
|
||||||
|
self.backup_disk_usage_percent = None;
|
||||||
|
self.backup_disk_used_gb = None;
|
||||||
|
self.backup_disk_total_gb = None;
|
||||||
|
self.backup_disk_wear_percent = None;
|
||||||
|
self.backup_disk_temperature = None;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Parse storage metrics into pools and drives
|
impl SystemWidget {
|
||||||
fn update_storage_from_metrics(&mut self, metrics: &[&Metric]) {
|
/// Convert structured storage data to internal format
|
||||||
|
fn update_storage_from_agent_data(&mut self, agent_data: &cm_dashboard_shared::AgentData) {
|
||||||
let mut pools: std::collections::HashMap<String, StoragePool> = std::collections::HashMap::new();
|
let mut pools: std::collections::HashMap<String, StoragePool> = std::collections::HashMap::new();
|
||||||
|
|
||||||
for metric in metrics {
|
// Convert drives
|
||||||
if metric.name.starts_with("disk_") {
|
for drive in &agent_data.system.storage.drives {
|
||||||
if let Some(pool_name) = self.extract_pool_name(&metric.name) {
|
let mut pool = StoragePool {
|
||||||
let mount_point = self.get_mount_point_for_pool(&pool_name);
|
name: drive.name.clone(),
|
||||||
let pool = pools.entry(pool_name.clone()).or_insert_with(|| StoragePool {
|
mount_point: drive.name.clone(),
|
||||||
name: pool_name.clone(),
|
pool_type: "drive".to_string(),
|
||||||
mount_point: mount_point.clone(),
|
drives: Vec::new(),
|
||||||
pool_type: "single".to_string(), // Default, will be updated
|
data_drives: Vec::new(),
|
||||||
pool_health: None,
|
parity_drives: Vec::new(),
|
||||||
drives: Vec::new(),
|
filesystems: Vec::new(),
|
||||||
filesystems: Vec::new(),
|
usage_percent: None,
|
||||||
usage_percent: None,
|
used_gb: None,
|
||||||
used_gb: None,
|
total_gb: None,
|
||||||
total_gb: None,
|
status: Status::Ok,
|
||||||
status: Status::Unknown,
|
};
|
||||||
health_status: Status::Unknown,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Parse different metric types
|
// Add drive info
|
||||||
if metric.name.contains("_usage_percent") && !metric.name.contains("_fs_") {
|
let display_name = drive.serial_number.as_ref()
|
||||||
// Only use drive-level metrics for pool totals, not filesystem metrics
|
.map(|s| truncate_serial(s))
|
||||||
if let MetricValue::Float(usage) = metric.value {
|
.unwrap_or(drive.name.clone());
|
||||||
pool.usage_percent = Some(usage);
|
let storage_drive = StorageDrive {
|
||||||
pool.status = metric.status.clone();
|
name: display_name,
|
||||||
}
|
temperature: drive.temperature_celsius,
|
||||||
} else if metric.name.contains("_used_gb") && !metric.name.contains("_fs_") {
|
wear_percent: drive.wear_percent,
|
||||||
// Only use drive-level metrics for pool totals, not filesystem metrics
|
status: Status::Ok,
|
||||||
if let MetricValue::Float(used) = metric.value {
|
};
|
||||||
pool.used_gb = Some(used);
|
pool.drives.push(storage_drive);
|
||||||
}
|
|
||||||
} else if metric.name.contains("_total_gb") && !metric.name.contains("_fs_") {
|
|
||||||
// Only use drive-level metrics for pool totals, not filesystem metrics
|
|
||||||
if let MetricValue::Float(total) = metric.value {
|
|
||||||
pool.total_gb = Some(total);
|
|
||||||
}
|
|
||||||
} else if metric.name.contains("_pool_type") {
|
|
||||||
if let MetricValue::String(pool_type) = &metric.value {
|
|
||||||
pool.pool_type = pool_type.clone();
|
|
||||||
}
|
|
||||||
} else if metric.name.contains("_pool_health") {
|
|
||||||
if let MetricValue::String(health) = &metric.value {
|
|
||||||
pool.pool_health = Some(health.clone());
|
|
||||||
pool.health_status = metric.status.clone();
|
|
||||||
}
|
|
||||||
} else if metric.name.contains("_temperature") {
|
|
||||||
if let Some(drive_name) = self.extract_drive_name(&metric.name) {
|
|
||||||
// Find existing drive or create new one
|
|
||||||
let drive_exists = pool.drives.iter().any(|d| d.name == drive_name);
|
|
||||||
if !drive_exists {
|
|
||||||
pool.drives.push(StorageDrive {
|
|
||||||
name: drive_name.clone(),
|
|
||||||
temperature: None,
|
|
||||||
wear_percent: None,
|
|
||||||
status: Status::Unknown,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(drive) = pool.drives.iter_mut().find(|d| d.name == drive_name) {
|
// Calculate totals from filesystems
|
||||||
if let MetricValue::Float(temp) = metric.value {
|
let total_used: f32 = drive.filesystems.iter().map(|fs| fs.used_gb).sum();
|
||||||
drive.temperature = Some(temp);
|
let total_size: f32 = drive.filesystems.iter().map(|fs| fs.total_gb).sum();
|
||||||
drive.status = metric.status.clone();
|
let average_usage = if total_size > 0.0 { (total_used / total_size) * 100.0 } else { 0.0 };
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if metric.name.contains("_wear_percent") {
|
|
||||||
if let Some(drive_name) = self.extract_drive_name(&metric.name) {
|
|
||||||
// Find existing drive or create new one
|
|
||||||
let drive_exists = pool.drives.iter().any(|d| d.name == drive_name);
|
|
||||||
if !drive_exists {
|
|
||||||
pool.drives.push(StorageDrive {
|
|
||||||
name: drive_name.clone(),
|
|
||||||
temperature: None,
|
|
||||||
wear_percent: None,
|
|
||||||
status: Status::Unknown,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(drive) = pool.drives.iter_mut().find(|d| d.name == drive_name) {
|
pool.usage_percent = Some(average_usage);
|
||||||
if let MetricValue::Float(wear) = metric.value {
|
pool.used_gb = Some(total_used);
|
||||||
drive.wear_percent = Some(wear);
|
pool.total_gb = Some(total_size);
|
||||||
drive.status = metric.status.clone();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if metric.name.contains("_fs_") {
|
|
||||||
// Handle filesystem metrics for physical drive pools (disk_{pool}_fs_{fs_name}_{metric})
|
|
||||||
if let (Some(fs_name), Some(metric_type)) = self.extract_filesystem_metric(&metric.name) {
|
|
||||||
// Find or create filesystem entry
|
|
||||||
let fs_exists = pool.filesystems.iter().any(|fs| {
|
|
||||||
let fs_id = if fs.mount_point == "/" {
|
|
||||||
"root".to_string()
|
|
||||||
} else {
|
|
||||||
fs.mount_point.trim_start_matches('/').replace('/', "_")
|
|
||||||
};
|
|
||||||
fs_id == fs_name
|
|
||||||
});
|
|
||||||
|
|
||||||
if !fs_exists {
|
// Add filesystems
|
||||||
// Create filesystem entry with correct mount point
|
for fs in &drive.filesystems {
|
||||||
let mount_point = if metric_type == "mount_point" {
|
let filesystem = FileSystem {
|
||||||
if let MetricValue::String(mount) = &metric.value {
|
mount_point: fs.mount.clone(),
|
||||||
mount.clone()
|
usage_percent: Some(fs.usage_percent),
|
||||||
} else {
|
used_gb: Some(fs.used_gb),
|
||||||
// Fallback: handle special cases
|
total_gb: Some(fs.total_gb),
|
||||||
if fs_name == "root" {
|
status: Status::Ok,
|
||||||
"/".to_string()
|
};
|
||||||
} else {
|
pool.filesystems.push(filesystem);
|
||||||
format!("/{}", fs_name.replace('_', "/"))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Fallback for non-mount_point metrics: generate mount point from fs_name
|
|
||||||
if fs_name == "root" {
|
|
||||||
"/".to_string()
|
|
||||||
} else {
|
|
||||||
format!("/{}", fs_name.replace('_', "/"))
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
pool.filesystems.push(FileSystem {
|
|
||||||
mount_point,
|
|
||||||
usage_percent: None,
|
|
||||||
used_gb: None,
|
|
||||||
total_gb: None,
|
|
||||||
available_gb: None,
|
|
||||||
status: Status::Unknown,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update the filesystem with the metric value
|
|
||||||
if let Some(filesystem) = pool.filesystems.iter_mut().find(|fs| {
|
|
||||||
let fs_id = if fs.mount_point == "/" {
|
|
||||||
"root".to_string()
|
|
||||||
} else {
|
|
||||||
fs.mount_point.trim_start_matches('/').replace('/', "_")
|
|
||||||
};
|
|
||||||
fs_id == fs_name
|
|
||||||
}) {
|
|
||||||
match metric_type.as_str() {
|
|
||||||
"usage_percent" => {
|
|
||||||
if let MetricValue::Float(usage) = metric.value {
|
|
||||||
filesystem.usage_percent = Some(usage);
|
|
||||||
filesystem.status = metric.status.clone();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"used_gb" => {
|
|
||||||
if let MetricValue::Float(used) = metric.value {
|
|
||||||
filesystem.used_gb = Some(used);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"total_gb" => {
|
|
||||||
if let MetricValue::Float(total) = metric.value {
|
|
||||||
filesystem.total_gb = Some(total);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"available_gb" => {
|
|
||||||
if let MetricValue::Float(available) = metric.value {
|
|
||||||
filesystem.available_gb = Some(available);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"mount_point" => {
|
|
||||||
if let MetricValue::String(mount) = &metric.value {
|
|
||||||
filesystem.mount_point = mount.clone();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pools.insert(drive.name.clone(), pool);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert to sorted vec for consistent ordering
|
// Convert pools (MergerFS, RAID, etc.)
|
||||||
|
for pool in &agent_data.system.storage.pools {
|
||||||
|
// Use agent-calculated status (combined health and usage status)
|
||||||
|
let pool_status = if pool.health_status == Status::Critical || pool.usage_status == Status::Critical {
|
||||||
|
Status::Critical
|
||||||
|
} else if pool.health_status == Status::Warning || pool.usage_status == Status::Warning {
|
||||||
|
Status::Warning
|
||||||
|
} else if pool.health_status == Status::Ok && pool.usage_status == Status::Ok {
|
||||||
|
Status::Ok
|
||||||
|
} else {
|
||||||
|
Status::Unknown
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut storage_pool = StoragePool {
|
||||||
|
name: pool.name.clone(),
|
||||||
|
mount_point: pool.mount.clone(),
|
||||||
|
pool_type: pool.pool_type.clone(),
|
||||||
|
drives: Vec::new(),
|
||||||
|
data_drives: Vec::new(),
|
||||||
|
parity_drives: Vec::new(),
|
||||||
|
filesystems: Vec::new(),
|
||||||
|
usage_percent: Some(pool.usage_percent),
|
||||||
|
used_gb: Some(pool.used_gb),
|
||||||
|
total_gb: Some(pool.total_gb),
|
||||||
|
status: pool_status,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Add data drives - use agent-calculated status
|
||||||
|
for drive in &pool.data_drives {
|
||||||
|
// Use combined health and temperature status
|
||||||
|
let drive_status = if drive.health_status == Status::Critical || drive.temperature_status == Status::Critical {
|
||||||
|
Status::Critical
|
||||||
|
} else if drive.health_status == Status::Warning || drive.temperature_status == Status::Warning {
|
||||||
|
Status::Warning
|
||||||
|
} else if drive.health_status == Status::Ok && drive.temperature_status == Status::Ok {
|
||||||
|
Status::Ok
|
||||||
|
} else {
|
||||||
|
Status::Unknown
|
||||||
|
};
|
||||||
|
|
||||||
|
let display_name = drive.serial_number.as_ref()
|
||||||
|
.map(|s| truncate_serial(s))
|
||||||
|
.unwrap_or(drive.name.clone());
|
||||||
|
let storage_drive = StorageDrive {
|
||||||
|
name: display_name,
|
||||||
|
temperature: drive.temperature_celsius,
|
||||||
|
wear_percent: drive.wear_percent,
|
||||||
|
status: drive_status,
|
||||||
|
};
|
||||||
|
storage_pool.data_drives.push(storage_drive);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add parity drives - use agent-calculated status
|
||||||
|
for drive in &pool.parity_drives {
|
||||||
|
// Use combined health and temperature status
|
||||||
|
let drive_status = if drive.health_status == Status::Critical || drive.temperature_status == Status::Critical {
|
||||||
|
Status::Critical
|
||||||
|
} else if drive.health_status == Status::Warning || drive.temperature_status == Status::Warning {
|
||||||
|
Status::Warning
|
||||||
|
} else if drive.health_status == Status::Ok && drive.temperature_status == Status::Ok {
|
||||||
|
Status::Ok
|
||||||
|
} else {
|
||||||
|
Status::Unknown
|
||||||
|
};
|
||||||
|
|
||||||
|
let display_name = drive.serial_number.as_ref()
|
||||||
|
.map(|s| truncate_serial(s))
|
||||||
|
.unwrap_or(drive.name.clone());
|
||||||
|
let storage_drive = StorageDrive {
|
||||||
|
name: display_name,
|
||||||
|
temperature: drive.temperature_celsius,
|
||||||
|
wear_percent: drive.wear_percent,
|
||||||
|
status: drive_status,
|
||||||
|
};
|
||||||
|
storage_pool.parity_drives.push(storage_drive);
|
||||||
|
}
|
||||||
|
|
||||||
|
pools.insert(pool.name.clone(), storage_pool);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Store pools
|
||||||
let mut pool_list: Vec<StoragePool> = pools.into_values().collect();
|
let mut pool_list: Vec<StoragePool> = pools.into_values().collect();
|
||||||
pool_list.sort_by(|a, b| a.name.cmp(&b.name)); // Sort alphabetically by name
|
pool_list.sort_by(|a, b| a.name.cmp(&b.name));
|
||||||
self.storage_pools = pool_list;
|
self.storage_pools = pool_list;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract pool name from disk metric name
|
|
||||||
fn extract_pool_name(&self, metric_name: &str) -> Option<String> {
|
|
||||||
// Pattern: disk_{pool_name}_{drive_name}_{metric_type}
|
|
||||||
// Since pool_name can contain underscores, work backwards from known metric suffixes
|
|
||||||
if metric_name.starts_with("disk_") {
|
|
||||||
// First try drive-specific metrics that have device names
|
|
||||||
if let Some(suffix_pos) = metric_name.rfind("_temperature")
|
|
||||||
.or_else(|| metric_name.rfind("_wear_percent"))
|
|
||||||
.or_else(|| metric_name.rfind("_health")) {
|
|
||||||
// Find the second-to-last underscore to get pool name
|
|
||||||
let before_suffix = &metric_name[..suffix_pos];
|
|
||||||
if let Some(drive_start) = before_suffix.rfind('_') {
|
|
||||||
if drive_start > 5 {
|
|
||||||
return Some(metric_name[5..drive_start].to_string()); // Skip "disk_"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Handle filesystem metrics: disk_{pool}_fs_{filesystem}_{metric}
|
|
||||||
else if metric_name.contains("_fs_") {
|
|
||||||
if let Some(fs_pos) = metric_name.find("_fs_") {
|
|
||||||
return Some(metric_name[5..fs_pos].to_string()); // Skip "disk_", extract pool name before "_fs_"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// For pool-level metrics (usage_percent, used_gb, total_gb), take everything before the metric suffix
|
|
||||||
else if let Some(suffix_pos) = metric_name.rfind("_usage_percent")
|
|
||||||
.or_else(|| metric_name.rfind("_used_gb"))
|
|
||||||
.or_else(|| metric_name.rfind("_total_gb"))
|
|
||||||
.or_else(|| metric_name.rfind("_available_gb")) {
|
|
||||||
return Some(metric_name[5..suffix_pos].to_string()); // Skip "disk_"
|
|
||||||
}
|
|
||||||
// Fallback to old behavior for unknown patterns
|
|
||||||
else if let Some(captures) = metric_name.strip_prefix("disk_") {
|
|
||||||
if let Some(pos) = captures.find('_') {
|
|
||||||
return Some(captures[..pos].to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extract filesystem name and metric type from filesystem metric names
|
|
||||||
/// Pattern: disk_{pool}_fs_{filesystem_name}_{metric_type}
|
|
||||||
fn extract_filesystem_metric(&self, metric_name: &str) -> (Option<String>, Option<String>) {
|
|
||||||
if metric_name.starts_with("disk_") && metric_name.contains("_fs_") {
|
|
||||||
// Find the _fs_ part
|
|
||||||
if let Some(fs_start) = metric_name.find("_fs_") {
|
|
||||||
let after_fs = &metric_name[fs_start + 4..]; // Skip "_fs_"
|
|
||||||
|
|
||||||
// Look for known metric suffixes (these can contain underscores)
|
|
||||||
let known_suffixes = ["usage_percent", "used_gb", "total_gb", "available_gb", "mount_point"];
|
|
||||||
|
|
||||||
for suffix in known_suffixes {
|
|
||||||
if after_fs.ends_with(suffix) {
|
|
||||||
// Extract filesystem name by removing suffix and underscore
|
|
||||||
if let Some(underscore_pos) = after_fs.rfind(&format!("_{}", suffix)) {
|
|
||||||
let fs_name = after_fs[..underscore_pos].to_string();
|
|
||||||
return (Some(fs_name), Some(suffix.to_string()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
(None, None)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extract drive name from disk metric name
|
|
||||||
fn extract_drive_name(&self, metric_name: &str) -> Option<String> {
|
|
||||||
// Pattern: disk_{pool_name}_{drive_name}_{metric_type}
|
|
||||||
// Since pool_name can contain underscores, work backwards from known metric suffixes
|
|
||||||
if metric_name.starts_with("disk_") {
|
|
||||||
if let Some(suffix_pos) = metric_name.rfind("_temperature")
|
|
||||||
.or_else(|| metric_name.rfind("_wear_percent"))
|
|
||||||
.or_else(|| metric_name.rfind("_health")) {
|
|
||||||
// Find the second-to-last underscore to get the drive name
|
|
||||||
let before_suffix = &metric_name[..suffix_pos];
|
|
||||||
if let Some(drive_start) = before_suffix.rfind('_') {
|
|
||||||
return Some(before_suffix[drive_start + 1..].to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Render storage section with enhanced tree structure
|
/// Render storage section with enhanced tree structure
|
||||||
fn render_storage(&self) -> Vec<Line<'_>> {
|
fn render_storage(&self) -> Vec<Line<'_>> {
|
||||||
let mut lines = Vec::new();
|
let mut lines = Vec::new();
|
||||||
|
|
||||||
for pool in &self.storage_pools {
|
for pool in &self.storage_pools {
|
||||||
// Pool header line with type and health
|
// Pool header line with type and health
|
||||||
let pool_label = if pool.pool_type == "single" {
|
let pool_label = if pool.pool_type == "drive" {
|
||||||
format!("{}:", pool.mount_point)
|
// For physical drives, show the drive name with temperature and wear percentage if available
|
||||||
} else {
|
// Physical drives only have one drive entry
|
||||||
format!("{} ({}):", pool.mount_point, pool.pool_type)
|
if let Some(drive) = pool.drives.first() {
|
||||||
};
|
let mut drive_details = Vec::new();
|
||||||
let pool_spans = StatusIcons::create_status_spans(
|
|
||||||
pool.health_status.clone(),
|
|
||||||
&pool_label
|
|
||||||
);
|
|
||||||
lines.push(Line::from(pool_spans));
|
|
||||||
|
|
||||||
// Pool health line (for multi-disk pools)
|
|
||||||
if pool.pool_type != "single" {
|
|
||||||
if let Some(health) = &pool.pool_health {
|
|
||||||
let health_text = match health.as_str() {
|
|
||||||
"healthy" => format!("Pool Status: {} Healthy",
|
|
||||||
if pool.drives.len() > 1 { format!("({} drives)", pool.drives.len()) } else { String::new() }),
|
|
||||||
"degraded" => "Pool Status: ⚠ Degraded".to_string(),
|
|
||||||
"critical" => "Pool Status: ✗ Critical".to_string(),
|
|
||||||
"rebuilding" => "Pool Status: ⟳ Rebuilding".to_string(),
|
|
||||||
_ => format!("Pool Status: ? {}", health),
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut health_spans = vec![
|
|
||||||
Span::raw(" "),
|
|
||||||
Span::styled("├─ ", Typography::tree()),
|
|
||||||
];
|
|
||||||
health_spans.extend(StatusIcons::create_status_spans(pool.health_status.clone(), &health_text));
|
|
||||||
lines.push(Line::from(health_spans));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Total usage line (always show for pools)
|
|
||||||
let usage_text = match (pool.usage_percent, pool.used_gb, pool.total_gb) {
|
|
||||||
(Some(pct), Some(used), Some(total)) => {
|
|
||||||
format!("Total: {:.0}% {:.1}GB/{:.1}GB", pct, used, total)
|
|
||||||
}
|
|
||||||
_ => "Total: —% —GB/—GB".to_string(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let has_drives = !pool.drives.is_empty();
|
|
||||||
let has_filesystems = !pool.filesystems.is_empty();
|
|
||||||
let has_children = has_drives || has_filesystems;
|
|
||||||
let tree_symbol = if has_children { "├─" } else { "└─" };
|
|
||||||
let mut usage_spans = vec![
|
|
||||||
Span::raw(" "),
|
|
||||||
Span::styled(tree_symbol, Typography::tree()),
|
|
||||||
Span::raw(" "),
|
|
||||||
];
|
|
||||||
usage_spans.extend(StatusIcons::create_status_spans(pool.status.clone(), &usage_text));
|
|
||||||
lines.push(Line::from(usage_spans));
|
|
||||||
|
|
||||||
// Drive lines with enhanced grouping
|
|
||||||
if pool.pool_type != "single" && pool.drives.len() > 1 {
|
|
||||||
// Group drives by type for mergerfs pools
|
|
||||||
let (data_drives, parity_drives): (Vec<_>, Vec<_>) = pool.drives.iter().enumerate()
|
|
||||||
.partition(|(_, drive)| {
|
|
||||||
// Simple heuristic: drives with 'parity' in name or sdc (common parity drive)
|
|
||||||
!drive.name.to_lowercase().contains("parity") && drive.name != "sdc"
|
|
||||||
});
|
|
||||||
|
|
||||||
// Show data drives
|
|
||||||
if !data_drives.is_empty() && pool.pool_type.contains("mergerfs") {
|
|
||||||
lines.push(Line::from(vec![
|
|
||||||
Span::raw(" "),
|
|
||||||
Span::styled("├─ ", Typography::tree()),
|
|
||||||
Span::styled("Data Disks:", Typography::secondary()),
|
|
||||||
]));
|
|
||||||
|
|
||||||
for (i, (_, drive)) in data_drives.iter().enumerate() {
|
|
||||||
let is_last = i == data_drives.len() - 1;
|
|
||||||
if is_last && parity_drives.is_empty() {
|
|
||||||
self.render_drive_line(&mut lines, drive, "│ └─");
|
|
||||||
} else {
|
|
||||||
self.render_drive_line(&mut lines, drive, "│ ├─");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Show parity drives
|
|
||||||
if !parity_drives.is_empty() && pool.pool_type.contains("mergerfs") {
|
|
||||||
lines.push(Line::from(vec![
|
|
||||||
Span::raw(" "),
|
|
||||||
Span::styled("└─ ", Typography::tree()),
|
|
||||||
Span::styled("Parity:", Typography::secondary()),
|
|
||||||
]));
|
|
||||||
|
|
||||||
for (i, (_, drive)) in parity_drives.iter().enumerate() {
|
|
||||||
let is_last = i == parity_drives.len() - 1;
|
|
||||||
if is_last {
|
|
||||||
self.render_drive_line(&mut lines, drive, " └─");
|
|
||||||
} else {
|
|
||||||
self.render_drive_line(&mut lines, drive, " ├─");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Regular drive listing for non-mergerfs pools
|
|
||||||
for (i, drive) in pool.drives.iter().enumerate() {
|
|
||||||
let is_last = i == pool.drives.len() - 1;
|
|
||||||
let tree_symbol = if is_last { "└─" } else { "├─" };
|
|
||||||
self.render_drive_line(&mut lines, drive, tree_symbol);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if pool.pool_type.starts_with("drive (") {
|
|
||||||
// Physical drive pools: show drive info + filesystem children
|
|
||||||
// First show drive information
|
|
||||||
for drive in &pool.drives {
|
|
||||||
let mut drive_info = Vec::new();
|
|
||||||
if let Some(temp) = drive.temperature {
|
if let Some(temp) = drive.temperature {
|
||||||
drive_info.push(format!("T: {:.0}°C", temp));
|
drive_details.push(format!("T: {}°C", temp as i32));
|
||||||
}
|
}
|
||||||
if let Some(wear) = drive.wear_percent {
|
if let Some(wear) = drive.wear_percent {
|
||||||
drive_info.push(format!("W: {:.0}%", wear));
|
drive_details.push(format!("W: {}%", wear as i32));
|
||||||
}
|
}
|
||||||
let drive_text = if drive_info.is_empty() {
|
|
||||||
format!("Drive: {}", drive.name)
|
if !drive_details.is_empty() {
|
||||||
|
format!("{} {}", drive.name, drive_details.join(" "))
|
||||||
} else {
|
} else {
|
||||||
format!("Drive: {}", drive_info.join(" "))
|
drive.name.clone()
|
||||||
};
|
}
|
||||||
|
} else {
|
||||||
let has_filesystems = !pool.filesystems.is_empty();
|
pool.name.clone()
|
||||||
let tree_symbol = if has_filesystems { "├─" } else { "└─" };
|
|
||||||
let mut drive_spans = vec![
|
|
||||||
Span::raw(" "),
|
|
||||||
Span::styled(tree_symbol, Typography::tree()),
|
|
||||||
Span::raw(" "),
|
|
||||||
];
|
|
||||||
drive_spans.extend(StatusIcons::create_status_spans(drive.status.clone(), &drive_text));
|
|
||||||
lines.push(Line::from(drive_spans));
|
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// For mergerfs pools, show pool type with mount point
|
||||||
|
format!("mergerfs {}:", pool.mount_point)
|
||||||
|
};
|
||||||
|
|
||||||
// Then show filesystem children
|
let pool_spans = StatusIcons::create_status_spans(pool.status.clone(), &pool_label);
|
||||||
|
lines.push(Line::from(pool_spans));
|
||||||
|
|
||||||
|
// Show individual filesystems for physical drives (matching CLAUDE.md format)
|
||||||
|
if pool.pool_type == "drive" {
|
||||||
|
// Show filesystem entries like: ├─ ● /: 55% 250.5GB/456.4GB
|
||||||
for (i, filesystem) in pool.filesystems.iter().enumerate() {
|
for (i, filesystem) in pool.filesystems.iter().enumerate() {
|
||||||
let is_last = i == pool.filesystems.len() - 1;
|
let is_last = i == pool.filesystems.len() - 1;
|
||||||
let tree_symbol = if is_last { "└─" } else { "├─" };
|
let tree_symbol = if is_last { " └─ " } else { " ├─ " };
|
||||||
|
|
||||||
let fs_text = match (filesystem.usage_percent, filesystem.used_gb, filesystem.total_gb) {
|
let fs_text = format!("{}: {:.0}% {:.1}GB/{:.1}GB",
|
||||||
(Some(pct), Some(used), Some(total)) => {
|
filesystem.mount_point,
|
||||||
format!("{}: {:.0}% {:.1}GB/{:.1}GB", filesystem.mount_point, pct, used, total)
|
filesystem.usage_percent.unwrap_or(0.0),
|
||||||
}
|
filesystem.used_gb.unwrap_or(0.0),
|
||||||
(Some(pct), _, Some(total)) => {
|
filesystem.total_gb.unwrap_or(0.0));
|
||||||
format!("{}: {:.0}% —GB/{:.1}GB", filesystem.mount_point, pct, total)
|
|
||||||
}
|
|
||||||
(Some(pct), _, _) => {
|
|
||||||
format!("{}: {:.0}% —GB/—GB", filesystem.mount_point, pct)
|
|
||||||
}
|
|
||||||
(_, Some(used), Some(total)) => {
|
|
||||||
format!("{}: —% {:.1}GB/{:.1}GB", filesystem.mount_point, used, total)
|
|
||||||
}
|
|
||||||
_ => format!("{}: —% —GB/—GB", filesystem.mount_point),
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut fs_spans = vec![
|
let mut fs_spans = vec![
|
||||||
Span::raw(" "),
|
|
||||||
Span::styled(tree_symbol, Typography::tree()),
|
Span::styled(tree_symbol, Typography::tree()),
|
||||||
Span::raw(" "),
|
|
||||||
];
|
];
|
||||||
fs_spans.extend(StatusIcons::create_status_spans(filesystem.status.clone(), &fs_text));
|
fs_spans.extend(StatusIcons::create_status_spans(
|
||||||
|
filesystem.status.clone(),
|
||||||
|
&fs_text
|
||||||
|
));
|
||||||
lines.push(Line::from(fs_spans));
|
lines.push(Line::from(fs_spans));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Single drive or simple pools
|
// For mergerfs pools, show structure matching CLAUDE.md format:
|
||||||
for (i, drive) in pool.drives.iter().enumerate() {
|
// ● mergerfs (2+1):
|
||||||
let is_last = i == pool.drives.len() - 1;
|
// ├─ Total: ● 63% 2355.2GB/3686.4GB
|
||||||
let tree_symbol = if is_last { "└─" } else { "├─" };
|
// ├─ Data Disks:
|
||||||
self.render_drive_line(&mut lines, drive, tree_symbol);
|
// │ ├─ ● sdb T: 24°C W: 5%
|
||||||
|
// │ └─ ● sdd T: 27°C W: 5%
|
||||||
|
// ├─ Parity: ● sdc T: 24°C W: 5%
|
||||||
|
// └─ Mount: /srv/media
|
||||||
|
|
||||||
|
// Pool total usage
|
||||||
|
let total_text = format!("{:.0}% {:.1}GB/{:.1}GB",
|
||||||
|
pool.usage_percent.unwrap_or(0.0),
|
||||||
|
pool.used_gb.unwrap_or(0.0),
|
||||||
|
pool.total_gb.unwrap_or(0.0)
|
||||||
|
);
|
||||||
|
let mut total_spans = vec![
|
||||||
|
Span::styled(" ├─ ", Typography::tree()),
|
||||||
|
];
|
||||||
|
total_spans.extend(StatusIcons::create_status_spans(Status::Ok, &total_text));
|
||||||
|
lines.push(Line::from(total_spans));
|
||||||
|
|
||||||
|
// Data drives - at same level as parity
|
||||||
|
let has_parity = !pool.parity_drives.is_empty();
|
||||||
|
for (i, drive) in pool.data_drives.iter().enumerate() {
|
||||||
|
let is_last_data = i == pool.data_drives.len() - 1;
|
||||||
|
let mut drive_details = Vec::new();
|
||||||
|
if let Some(temp) = drive.temperature {
|
||||||
|
drive_details.push(format!("T: {}°C", temp as i32));
|
||||||
|
}
|
||||||
|
if let Some(wear) = drive.wear_percent {
|
||||||
|
drive_details.push(format!("W: {}%", wear as i32));
|
||||||
|
}
|
||||||
|
|
||||||
|
let drive_text = if !drive_details.is_empty() {
|
||||||
|
format!("Data_{}: {} {}", i + 1, drive.name, drive_details.join(" "))
|
||||||
|
} else {
|
||||||
|
format!("Data_{}: {}", i + 1, drive.name)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Last data drive uses └─ if there's no parity, otherwise ├─
|
||||||
|
let tree_symbol = if is_last_data && !has_parity { " └─ " } else { " ├─ " };
|
||||||
|
let mut data_spans = vec![
|
||||||
|
Span::styled(tree_symbol, Typography::tree()),
|
||||||
|
];
|
||||||
|
data_spans.extend(StatusIcons::create_status_spans(drive.status.clone(), &drive_text));
|
||||||
|
lines.push(Line::from(data_spans));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parity drives - last item(s)
|
||||||
|
if !pool.parity_drives.is_empty() {
|
||||||
|
for (i, drive) in pool.parity_drives.iter().enumerate() {
|
||||||
|
let is_last = i == pool.parity_drives.len() - 1;
|
||||||
|
let mut drive_details = Vec::new();
|
||||||
|
if let Some(temp) = drive.temperature {
|
||||||
|
drive_details.push(format!("T: {}°C", temp as i32));
|
||||||
|
}
|
||||||
|
if let Some(wear) = drive.wear_percent {
|
||||||
|
drive_details.push(format!("W: {}%", wear as i32));
|
||||||
|
}
|
||||||
|
|
||||||
|
let drive_text = if !drive_details.is_empty() {
|
||||||
|
format!("Parity: {} {}", drive.name, drive_details.join(" "))
|
||||||
|
} else {
|
||||||
|
format!("Parity: {}", drive.name)
|
||||||
|
};
|
||||||
|
|
||||||
|
let tree_symbol = if is_last { " └─ " } else { " ├─ " };
|
||||||
|
let mut parity_spans = vec![
|
||||||
|
Span::styled(tree_symbol, Typography::tree()),
|
||||||
|
];
|
||||||
|
parity_spans.extend(StatusIcons::create_status_spans(drive.status.clone(), &drive_text));
|
||||||
|
lines.push(Line::from(parity_spans));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
lines
|
lines
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Helper to render a single drive line
|
|
||||||
fn render_drive_line<'a>(&self, lines: &mut Vec<Line<'a>>, drive: &StorageDrive, tree_symbol: &'a str) {
|
|
||||||
let mut drive_info = Vec::new();
|
|
||||||
if let Some(temp) = drive.temperature {
|
|
||||||
drive_info.push(format!("T: {:.0}°C", temp));
|
|
||||||
}
|
|
||||||
if let Some(wear) = drive.wear_percent {
|
|
||||||
drive_info.push(format!("W: {:.0}%", wear));
|
|
||||||
}
|
|
||||||
let drive_text = if drive_info.is_empty() {
|
|
||||||
drive.name.clone()
|
|
||||||
} else {
|
|
||||||
format!("{} {}", drive.name, drive_info.join(" • "))
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut drive_spans = vec![
|
|
||||||
Span::raw(" "),
|
|
||||||
Span::styled(tree_symbol, Typography::tree()),
|
|
||||||
Span::raw(" "),
|
|
||||||
];
|
|
||||||
drive_spans.extend(StatusIcons::create_status_spans(drive.status.clone(), &drive_text));
|
|
||||||
lines.push(Line::from(drive_spans));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Widget for SystemWidget {
|
/// Truncate serial number to last 8 characters
|
||||||
fn update_from_metrics(&mut self, metrics: &[&Metric]) {
|
fn truncate_serial(serial: &str) -> String {
|
||||||
self.has_data = !metrics.is_empty();
|
let len = serial.len();
|
||||||
|
if len > 8 {
|
||||||
for metric in metrics {
|
serial[len - 8..].to_string()
|
||||||
match metric.name.as_str() {
|
} else {
|
||||||
// NixOS metrics
|
serial.to_string()
|
||||||
"system_nixos_build" => {
|
|
||||||
if let MetricValue::String(build) = &metric.value {
|
|
||||||
self.nixos_build = Some(build.clone());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"system_config_hash" => {
|
|
||||||
if let MetricValue::String(hash) = &metric.value {
|
|
||||||
self.config_hash = Some(hash.clone());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"agent_version" => {
|
|
||||||
if let MetricValue::String(version) = &metric.value {
|
|
||||||
self.agent_hash = Some(version.clone());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// CPU metrics
|
|
||||||
"cpu_load_1min" => {
|
|
||||||
if let MetricValue::Float(load) = metric.value {
|
|
||||||
self.cpu_load_1min = Some(load);
|
|
||||||
self.cpu_status = metric.status.clone();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"cpu_load_5min" => {
|
|
||||||
if let MetricValue::Float(load) = metric.value {
|
|
||||||
self.cpu_load_5min = Some(load);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"cpu_load_15min" => {
|
|
||||||
if let MetricValue::Float(load) = metric.value {
|
|
||||||
self.cpu_load_15min = Some(load);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"cpu_frequency_mhz" => {
|
|
||||||
if let MetricValue::Float(freq) = metric.value {
|
|
||||||
self.cpu_frequency = Some(freq);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Memory metrics
|
|
||||||
"memory_usage_percent" => {
|
|
||||||
if let MetricValue::Float(usage) = metric.value {
|
|
||||||
self.memory_usage_percent = Some(usage);
|
|
||||||
self.memory_status = metric.status.clone();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"memory_used_gb" => {
|
|
||||||
if let MetricValue::Float(used) = metric.value {
|
|
||||||
self.memory_used_gb = Some(used);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"memory_total_gb" => {
|
|
||||||
if let MetricValue::Float(total) = metric.value {
|
|
||||||
self.memory_total_gb = Some(total);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tmpfs metrics
|
|
||||||
"memory_tmp_usage_percent" => {
|
|
||||||
if let MetricValue::Float(usage) = metric.value {
|
|
||||||
self.tmp_usage_percent = Some(usage);
|
|
||||||
self.tmp_status = metric.status.clone();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"memory_tmp_used_gb" => {
|
|
||||||
if let MetricValue::Float(used) = metric.value {
|
|
||||||
self.tmp_used_gb = Some(used);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"memory_tmp_total_gb" => {
|
|
||||||
if let MetricValue::Float(total) = metric.value {
|
|
||||||
self.tmp_total_gb = Some(total);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update storage from all disk metrics
|
|
||||||
self.update_storage_from_metrics(metrics);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SystemWidget {
|
impl SystemWidget {
|
||||||
|
/// Render backup section for display
|
||||||
|
fn render_backup(&self) -> Vec<Line<'_>> {
|
||||||
|
let mut lines = Vec::new();
|
||||||
|
|
||||||
|
// First line: serial number with temperature and wear
|
||||||
|
if let Some(serial) = &self.backup_disk_serial {
|
||||||
|
let truncated_serial = truncate_serial(serial);
|
||||||
|
let mut details = Vec::new();
|
||||||
|
if let Some(temp) = self.backup_disk_temperature {
|
||||||
|
details.push(format!("T: {}°C", temp as i32));
|
||||||
|
}
|
||||||
|
if let Some(wear) = self.backup_disk_wear_percent {
|
||||||
|
details.push(format!("W: {}%", wear as i32));
|
||||||
|
}
|
||||||
|
|
||||||
|
let disk_text = if !details.is_empty() {
|
||||||
|
format!("{} {}", truncated_serial, details.join(" "))
|
||||||
|
} else {
|
||||||
|
truncated_serial
|
||||||
|
};
|
||||||
|
|
||||||
|
let backup_status = match self.backup_status.as_str() {
|
||||||
|
"completed" | "success" => Status::Ok,
|
||||||
|
"running" => Status::Pending,
|
||||||
|
"failed" => Status::Critical,
|
||||||
|
_ => Status::Unknown,
|
||||||
|
};
|
||||||
|
|
||||||
|
let disk_spans = StatusIcons::create_status_spans(backup_status, &disk_text);
|
||||||
|
lines.push(Line::from(disk_spans));
|
||||||
|
|
||||||
|
// Show backup time from TOML if available
|
||||||
|
if let Some(start_time) = &self.backup_start_time_raw {
|
||||||
|
let time_text = if let Some(size) = self.backup_last_size_gb {
|
||||||
|
format!("Time: {} ({:.1}GB)", start_time, size)
|
||||||
|
} else {
|
||||||
|
format!("Time: {}", start_time)
|
||||||
|
};
|
||||||
|
|
||||||
|
lines.push(Line::from(vec![
|
||||||
|
Span::styled(" ├─ ", Typography::tree()),
|
||||||
|
Span::styled(time_text, Typography::secondary())
|
||||||
|
]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Usage information
|
||||||
|
if let (Some(used), Some(total), Some(usage_percent)) = (
|
||||||
|
self.backup_disk_used_gb,
|
||||||
|
self.backup_disk_total_gb,
|
||||||
|
self.backup_disk_usage_percent
|
||||||
|
) {
|
||||||
|
let usage_text = format!("Usage: {:.0}% {:.0}GB/{:.0}GB", usage_percent, used, total);
|
||||||
|
let usage_spans = StatusIcons::create_status_spans(Status::Ok, &usage_text);
|
||||||
|
let mut full_spans = vec![
|
||||||
|
Span::styled(" └─ ", Typography::tree()),
|
||||||
|
];
|
||||||
|
full_spans.extend(usage_spans);
|
||||||
|
lines.push(Line::from(full_spans));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
lines
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compress IPv4 addresses from same subnet
|
||||||
|
/// Example: "192.168.30.1, 192.168.30.100" -> "192.168.30.1, 100"
|
||||||
|
fn compress_ipv4_addresses(addresses: &[String]) -> String {
|
||||||
|
if addresses.is_empty() {
|
||||||
|
return String::new();
|
||||||
|
}
|
||||||
|
|
||||||
|
if addresses.len() == 1 {
|
||||||
|
return addresses[0].clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut result = Vec::new();
|
||||||
|
let mut last_prefix = String::new();
|
||||||
|
|
||||||
|
for addr in addresses {
|
||||||
|
let parts: Vec<&str> = addr.split('.').collect();
|
||||||
|
if parts.len() == 4 {
|
||||||
|
let prefix = format!("{}.{}.{}", parts[0], parts[1], parts[2]);
|
||||||
|
|
||||||
|
if prefix == last_prefix {
|
||||||
|
// Same subnet, show only last octet
|
||||||
|
result.push(parts[3].to_string());
|
||||||
|
} else {
|
||||||
|
// Different subnet, show full IP
|
||||||
|
result.push(addr.clone());
|
||||||
|
last_prefix = prefix;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Invalid IP format, show as-is
|
||||||
|
result.push(addr.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
result.join(", ")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Render network section for display with physical/virtual grouping
|
||||||
|
fn render_network(&self) -> Vec<Line<'_>> {
|
||||||
|
let mut lines = Vec::new();
|
||||||
|
|
||||||
|
if self.network_interfaces.is_empty() {
|
||||||
|
return lines;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Separate physical and virtual interfaces
|
||||||
|
let physical: Vec<_> = self.network_interfaces.iter().filter(|i| i.is_physical).collect();
|
||||||
|
let virtual_interfaces: Vec<_> = self.network_interfaces.iter().filter(|i| !i.is_physical).collect();
|
||||||
|
|
||||||
|
// Find standalone virtual interfaces (those without a parent)
|
||||||
|
let mut standalone_virtual: Vec<_> = virtual_interfaces.iter()
|
||||||
|
.filter(|i| i.parent_interface.is_none())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Sort standalone virtual: VLANs first (by VLAN ID), then others alphabetically
|
||||||
|
standalone_virtual.sort_by(|a, b| {
|
||||||
|
match (a.vlan_id, b.vlan_id) {
|
||||||
|
(Some(vlan_a), Some(vlan_b)) => vlan_a.cmp(&vlan_b),
|
||||||
|
(Some(_), None) => std::cmp::Ordering::Less,
|
||||||
|
(None, Some(_)) => std::cmp::Ordering::Greater,
|
||||||
|
(None, None) => a.name.cmp(&b.name),
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Render physical interfaces with their children
|
||||||
|
for (phy_idx, interface) in physical.iter().enumerate() {
|
||||||
|
let is_last_physical = phy_idx == physical.len() - 1 && standalone_virtual.is_empty();
|
||||||
|
|
||||||
|
// Physical interface header with status icon
|
||||||
|
let mut header_spans = vec![];
|
||||||
|
header_spans.extend(StatusIcons::create_status_spans(
|
||||||
|
interface.link_status.clone(),
|
||||||
|
&format!("{}:", interface.name)
|
||||||
|
));
|
||||||
|
lines.push(Line::from(header_spans));
|
||||||
|
|
||||||
|
// Find child interfaces for this physical interface
|
||||||
|
let mut children: Vec<_> = virtual_interfaces.iter()
|
||||||
|
.filter(|vi| {
|
||||||
|
if let Some(parent) = &vi.parent_interface {
|
||||||
|
parent == &interface.name
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Sort children: VLANs first (by VLAN ID), then others alphabetically
|
||||||
|
children.sort_by(|a, b| {
|
||||||
|
match (a.vlan_id, b.vlan_id) {
|
||||||
|
(Some(vlan_a), Some(vlan_b)) => vlan_a.cmp(&vlan_b),
|
||||||
|
(Some(_), None) => std::cmp::Ordering::Less,
|
||||||
|
(None, Some(_)) => std::cmp::Ordering::Greater,
|
||||||
|
(None, None) => a.name.cmp(&b.name),
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Count total items under this physical interface (IPs + children)
|
||||||
|
let ip_count = interface.ipv4_addresses.len() + interface.ipv6_addresses.len();
|
||||||
|
let total_children = ip_count + children.len();
|
||||||
|
let mut child_index = 0;
|
||||||
|
|
||||||
|
// IPv4 addresses on the physical interface itself
|
||||||
|
for ipv4 in &interface.ipv4_addresses {
|
||||||
|
child_index += 1;
|
||||||
|
let is_last = child_index == total_children && is_last_physical;
|
||||||
|
let tree_symbol = if is_last { " └─ " } else { " ├─ " };
|
||||||
|
lines.push(Line::from(vec![
|
||||||
|
Span::styled(tree_symbol, Typography::tree()),
|
||||||
|
Span::styled(format!("ip: {}", ipv4), Typography::secondary()),
|
||||||
|
]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// IPv6 addresses on the physical interface itself
|
||||||
|
for ipv6 in &interface.ipv6_addresses {
|
||||||
|
child_index += 1;
|
||||||
|
let is_last = child_index == total_children && is_last_physical;
|
||||||
|
let tree_symbol = if is_last { " └─ " } else { " ├─ " };
|
||||||
|
lines.push(Line::from(vec![
|
||||||
|
Span::styled(tree_symbol, Typography::tree()),
|
||||||
|
Span::styled(format!("ip: {}", ipv6), Typography::secondary()),
|
||||||
|
]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Child virtual interfaces (VLANs, etc.)
|
||||||
|
for child in children {
|
||||||
|
child_index += 1;
|
||||||
|
let is_last = child_index == total_children && is_last_physical;
|
||||||
|
let tree_symbol = if is_last { " └─ " } else { " ├─ " };
|
||||||
|
|
||||||
|
let ip_text = if !child.ipv4_addresses.is_empty() {
|
||||||
|
Self::compress_ipv4_addresses(&child.ipv4_addresses)
|
||||||
|
} else if !child.ipv6_addresses.is_empty() {
|
||||||
|
child.ipv6_addresses.join(", ")
|
||||||
|
} else {
|
||||||
|
String::new()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Format: "name (vlan X): IP" or "name: IP"
|
||||||
|
let child_text = if let Some(vlan_id) = child.vlan_id {
|
||||||
|
if !ip_text.is_empty() {
|
||||||
|
format!("{} (vlan {}): {}", child.name, vlan_id, ip_text)
|
||||||
|
} else {
|
||||||
|
format!("{} (vlan {}):", child.name, vlan_id)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if !ip_text.is_empty() {
|
||||||
|
format!("{}: {}", child.name, ip_text)
|
||||||
|
} else {
|
||||||
|
format!("{}:", child.name)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
lines.push(Line::from(vec![
|
||||||
|
Span::styled(tree_symbol, Typography::tree()),
|
||||||
|
Span::styled(child_text, Typography::secondary()),
|
||||||
|
]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Render standalone virtual interfaces (those without a parent)
|
||||||
|
for (virt_idx, interface) in standalone_virtual.iter().enumerate() {
|
||||||
|
let is_last = virt_idx == standalone_virtual.len() - 1;
|
||||||
|
let tree_symbol = if is_last { " └─ " } else { " ├─ " };
|
||||||
|
|
||||||
|
// Virtual interface with IPs
|
||||||
|
let ip_text = if !interface.ipv4_addresses.is_empty() {
|
||||||
|
Self::compress_ipv4_addresses(&interface.ipv4_addresses)
|
||||||
|
} else if !interface.ipv6_addresses.is_empty() {
|
||||||
|
interface.ipv6_addresses.join(", ")
|
||||||
|
} else {
|
||||||
|
String::new()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Format: "name (vlan X): IP" or "name: IP"
|
||||||
|
let interface_text = if let Some(vlan_id) = interface.vlan_id {
|
||||||
|
if !ip_text.is_empty() {
|
||||||
|
format!("{} (vlan {}): {}", interface.name, vlan_id, ip_text)
|
||||||
|
} else {
|
||||||
|
format!("{} (vlan {}):", interface.name, vlan_id)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if !ip_text.is_empty() {
|
||||||
|
format!("{}: {}", interface.name, ip_text)
|
||||||
|
} else {
|
||||||
|
format!("{}:", interface.name)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
lines.push(Line::from(vec![
|
||||||
|
Span::styled(tree_symbol, Typography::tree()),
|
||||||
|
Span::styled(interface_text, Typography::secondary()),
|
||||||
|
]));
|
||||||
|
}
|
||||||
|
|
||||||
|
lines
|
||||||
|
}
|
||||||
|
|
||||||
/// Render system widget
|
/// Render system widget
|
||||||
pub fn render(&mut self, frame: &mut Frame, area: Rect, hostname: &str, config: Option<&crate::config::DashboardConfig>) {
|
pub fn render(&mut self, frame: &mut Frame, area: Rect, hostname: &str, _config: Option<&crate::config::DashboardConfig>) {
|
||||||
let mut lines = Vec::new();
|
let mut lines = Vec::new();
|
||||||
|
|
||||||
// NixOS section
|
// NixOS section
|
||||||
@@ -739,17 +796,6 @@ impl SystemWidget {
|
|||||||
Span::styled(format!("Agent: {}", agent_version_text), Typography::secondary())
|
Span::styled(format!("Agent: {}", agent_version_text), Typography::secondary())
|
||||||
]));
|
]));
|
||||||
|
|
||||||
// Display detected connection IP
|
|
||||||
if let Some(config) = config {
|
|
||||||
if let Some(host_details) = config.hosts.get(hostname) {
|
|
||||||
let detected_ip = host_details.get_connection_ip(hostname);
|
|
||||||
lines.push(Line::from(vec![
|
|
||||||
Span::styled(format!("IP: {}", detected_ip), Typography::secondary())
|
|
||||||
]));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// CPU section
|
// CPU section
|
||||||
lines.push(Line::from(vec![
|
lines.push(Line::from(vec![
|
||||||
Span::styled("CPU:", Typography::widget_title())
|
Span::styled("CPU:", Typography::widget_title())
|
||||||
@@ -780,15 +826,39 @@ impl SystemWidget {
|
|||||||
);
|
);
|
||||||
lines.push(Line::from(memory_spans));
|
lines.push(Line::from(memory_spans));
|
||||||
|
|
||||||
let tmp_text = self.format_tmp_usage();
|
// Display all tmpfs mounts
|
||||||
let mut tmp_spans = vec![
|
for (i, tmpfs) in self.tmpfs_mounts.iter().enumerate() {
|
||||||
Span::styled(" └─ ", Typography::tree()),
|
let is_last = i == self.tmpfs_mounts.len() - 1;
|
||||||
];
|
let tree_symbol = if is_last { " └─ " } else { " ├─ " };
|
||||||
tmp_spans.extend(StatusIcons::create_status_spans(
|
|
||||||
self.tmp_status.clone(),
|
let usage_text = if tmpfs.total_gb > 0.0 {
|
||||||
&format!("/tmp: {}", tmp_text)
|
format!("{:.0}% {:.1}GB/{:.1}GB",
|
||||||
));
|
tmpfs.usage_percent,
|
||||||
lines.push(Line::from(tmp_spans));
|
tmpfs.used_gb,
|
||||||
|
tmpfs.total_gb)
|
||||||
|
} else {
|
||||||
|
"— —/—".to_string()
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut tmpfs_spans = vec![
|
||||||
|
Span::styled(tree_symbol, Typography::tree()),
|
||||||
|
];
|
||||||
|
tmpfs_spans.extend(StatusIcons::create_status_spans(
|
||||||
|
Status::Ok, // TODO: Calculate status based on usage_percent
|
||||||
|
&format!("{}: {}", tmpfs.mount, usage_text)
|
||||||
|
));
|
||||||
|
lines.push(Line::from(tmpfs_spans));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Network section
|
||||||
|
if !self.network_interfaces.is_empty() {
|
||||||
|
lines.push(Line::from(vec![
|
||||||
|
Span::styled("Network:", Typography::widget_title())
|
||||||
|
]));
|
||||||
|
|
||||||
|
let network_lines = self.render_network();
|
||||||
|
lines.extend(network_lines);
|
||||||
|
}
|
||||||
|
|
||||||
// Storage section
|
// Storage section
|
||||||
lines.push(Line::from(vec![
|
lines.push(Line::from(vec![
|
||||||
@@ -799,6 +869,16 @@ impl SystemWidget {
|
|||||||
let storage_lines = self.render_storage();
|
let storage_lines = self.render_storage();
|
||||||
lines.extend(storage_lines);
|
lines.extend(storage_lines);
|
||||||
|
|
||||||
|
// Backup section (if available)
|
||||||
|
if self.backup_status != "unavailable" && self.backup_status != "unknown" {
|
||||||
|
lines.push(Line::from(vec![
|
||||||
|
Span::styled("Backup:", Typography::widget_title())
|
||||||
|
]));
|
||||||
|
|
||||||
|
let backup_lines = self.render_backup();
|
||||||
|
lines.extend(backup_lines);
|
||||||
|
}
|
||||||
|
|
||||||
// Apply scroll offset
|
// Apply scroll offset
|
||||||
let total_lines = lines.len();
|
let total_lines = lines.len();
|
||||||
let available_height = area.height as usize;
|
let available_height = area.height as usize;
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "cm-dashboard-shared"
|
name = "cm-dashboard-shared"
|
||||||
version = "0.1.118"
|
version = "0.1.181"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
|||||||
231
shared/src/agent_data.rs
Normal file
231
shared/src/agent_data.rs
Normal file
@@ -0,0 +1,231 @@
|
|||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use crate::Status;
|
||||||
|
|
||||||
|
/// Complete structured data from an agent
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct AgentData {
|
||||||
|
pub hostname: String,
|
||||||
|
pub agent_version: String,
|
||||||
|
pub build_version: Option<String>,
|
||||||
|
pub timestamp: u64,
|
||||||
|
pub system: SystemData,
|
||||||
|
pub services: Vec<ServiceData>,
|
||||||
|
pub backup: BackupData,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// System-level monitoring data
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct SystemData {
|
||||||
|
pub network: NetworkData,
|
||||||
|
pub cpu: CpuData,
|
||||||
|
pub memory: MemoryData,
|
||||||
|
pub storage: StorageData,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Network interface monitoring data
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct NetworkData {
|
||||||
|
pub interfaces: Vec<NetworkInterfaceData>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Individual network interface data
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct NetworkInterfaceData {
|
||||||
|
pub name: String,
|
||||||
|
pub ipv4_addresses: Vec<String>,
|
||||||
|
pub ipv6_addresses: Vec<String>,
|
||||||
|
pub is_physical: bool,
|
||||||
|
pub link_status: Status,
|
||||||
|
pub parent_interface: Option<String>,
|
||||||
|
pub vlan_id: Option<u16>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// CPU monitoring data
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct CpuData {
|
||||||
|
pub load_1min: f32,
|
||||||
|
pub load_5min: f32,
|
||||||
|
pub load_15min: f32,
|
||||||
|
pub frequency_mhz: f32,
|
||||||
|
pub temperature_celsius: Option<f32>,
|
||||||
|
pub load_status: Status,
|
||||||
|
pub temperature_status: Status,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Memory monitoring data
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct MemoryData {
|
||||||
|
pub usage_percent: f32,
|
||||||
|
pub total_gb: f32,
|
||||||
|
pub used_gb: f32,
|
||||||
|
pub available_gb: f32,
|
||||||
|
pub swap_total_gb: f32,
|
||||||
|
pub swap_used_gb: f32,
|
||||||
|
pub tmpfs: Vec<TmpfsData>,
|
||||||
|
pub usage_status: Status,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tmpfs filesystem data
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct TmpfsData {
|
||||||
|
pub mount: String,
|
||||||
|
pub usage_percent: f32,
|
||||||
|
pub used_gb: f32,
|
||||||
|
pub total_gb: f32,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Storage monitoring data
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct StorageData {
|
||||||
|
pub drives: Vec<DriveData>,
|
||||||
|
pub pools: Vec<PoolData>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Individual drive data
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct DriveData {
|
||||||
|
pub name: String,
|
||||||
|
pub serial_number: Option<String>,
|
||||||
|
pub health: String,
|
||||||
|
pub temperature_celsius: Option<f32>,
|
||||||
|
pub wear_percent: Option<f32>,
|
||||||
|
pub filesystems: Vec<FilesystemData>,
|
||||||
|
pub temperature_status: Status,
|
||||||
|
pub health_status: Status,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Filesystem on a drive
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct FilesystemData {
|
||||||
|
pub mount: String,
|
||||||
|
pub usage_percent: f32,
|
||||||
|
pub used_gb: f32,
|
||||||
|
pub total_gb: f32,
|
||||||
|
pub usage_status: Status,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Storage pool (MergerFS, RAID, etc.)
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct PoolData {
|
||||||
|
pub name: String,
|
||||||
|
pub mount: String,
|
||||||
|
pub pool_type: String, // "mergerfs", "raid", etc.
|
||||||
|
pub health: String,
|
||||||
|
pub usage_percent: f32,
|
||||||
|
pub used_gb: f32,
|
||||||
|
pub total_gb: f32,
|
||||||
|
pub data_drives: Vec<PoolDriveData>,
|
||||||
|
pub parity_drives: Vec<PoolDriveData>,
|
||||||
|
pub health_status: Status,
|
||||||
|
pub usage_status: Status,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Drive in a storage pool
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct PoolDriveData {
|
||||||
|
pub name: String,
|
||||||
|
pub serial_number: Option<String>,
|
||||||
|
pub temperature_celsius: Option<f32>,
|
||||||
|
pub wear_percent: Option<f32>,
|
||||||
|
pub health: String,
|
||||||
|
pub health_status: Status,
|
||||||
|
pub temperature_status: Status,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Service monitoring data
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct ServiceData {
|
||||||
|
pub name: String,
|
||||||
|
pub memory_mb: f32,
|
||||||
|
pub disk_gb: f32,
|
||||||
|
pub user_stopped: bool,
|
||||||
|
pub service_status: Status,
|
||||||
|
pub sub_services: Vec<SubServiceData>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sub-service data (nginx sites, docker containers, etc.)
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct SubServiceData {
|
||||||
|
pub name: String,
|
||||||
|
pub service_status: Status,
|
||||||
|
pub metrics: Vec<SubServiceMetric>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Individual metric for a sub-service
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct SubServiceMetric {
|
||||||
|
pub label: String,
|
||||||
|
pub value: f32,
|
||||||
|
pub unit: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Backup system data
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct BackupData {
|
||||||
|
pub status: String,
|
||||||
|
pub total_size_gb: Option<f32>,
|
||||||
|
pub repository_health: Option<String>,
|
||||||
|
pub repository_disk: Option<BackupDiskData>,
|
||||||
|
pub last_backup_size_gb: Option<f32>,
|
||||||
|
pub start_time_raw: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Backup repository disk information
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct BackupDiskData {
|
||||||
|
pub serial: String,
|
||||||
|
pub usage_percent: f32,
|
||||||
|
pub used_gb: f32,
|
||||||
|
pub total_gb: f32,
|
||||||
|
pub wear_percent: Option<f32>,
|
||||||
|
pub temperature_celsius: Option<f32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AgentData {
|
||||||
|
/// Create new agent data with current timestamp
|
||||||
|
pub fn new(hostname: String, agent_version: String) -> Self {
|
||||||
|
Self {
|
||||||
|
hostname,
|
||||||
|
agent_version,
|
||||||
|
build_version: None,
|
||||||
|
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||||
|
system: SystemData {
|
||||||
|
network: NetworkData {
|
||||||
|
interfaces: Vec::new(),
|
||||||
|
},
|
||||||
|
cpu: CpuData {
|
||||||
|
load_1min: 0.0,
|
||||||
|
load_5min: 0.0,
|
||||||
|
load_15min: 0.0,
|
||||||
|
frequency_mhz: 0.0,
|
||||||
|
temperature_celsius: None,
|
||||||
|
load_status: Status::Unknown,
|
||||||
|
temperature_status: Status::Unknown,
|
||||||
|
},
|
||||||
|
memory: MemoryData {
|
||||||
|
usage_percent: 0.0,
|
||||||
|
total_gb: 0.0,
|
||||||
|
used_gb: 0.0,
|
||||||
|
available_gb: 0.0,
|
||||||
|
swap_total_gb: 0.0,
|
||||||
|
swap_used_gb: 0.0,
|
||||||
|
tmpfs: Vec::new(),
|
||||||
|
usage_status: Status::Unknown,
|
||||||
|
},
|
||||||
|
storage: StorageData {
|
||||||
|
drives: Vec::new(),
|
||||||
|
pools: Vec::new(),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
services: Vec::new(),
|
||||||
|
backup: BackupData {
|
||||||
|
status: "unknown".to_string(),
|
||||||
|
total_size_gb: None,
|
||||||
|
repository_health: None,
|
||||||
|
repository_disk: None,
|
||||||
|
last_backup_size_gb: None,
|
||||||
|
start_time_raw: None,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,8 +1,10 @@
|
|||||||
|
pub mod agent_data;
|
||||||
pub mod cache;
|
pub mod cache;
|
||||||
pub mod error;
|
pub mod error;
|
||||||
pub mod metrics;
|
pub mod metrics;
|
||||||
pub mod protocol;
|
pub mod protocol;
|
||||||
|
|
||||||
|
pub use agent_data::*;
|
||||||
pub use cache::*;
|
pub use cache::*;
|
||||||
pub use error::*;
|
pub use error::*;
|
||||||
pub use metrics::*;
|
pub use metrics::*;
|
||||||
|
|||||||
@@ -131,6 +131,17 @@ impl HysteresisThresholds {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Evaluate value against thresholds to determine status
|
||||||
|
pub fn evaluate(&self, value: f32) -> Status {
|
||||||
|
if value >= self.critical_high {
|
||||||
|
Status::Critical
|
||||||
|
} else if value >= self.warning_high {
|
||||||
|
Status::Warning
|
||||||
|
} else {
|
||||||
|
Status::Ok
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn with_custom_gaps(warning_high: f32, warning_gap: f32, critical_high: f32, critical_gap: f32) -> Self {
|
pub fn with_custom_gaps(warning_high: f32, warning_gap: f32, critical_high: f32, critical_gap: f32) -> Self {
|
||||||
Self {
|
Self {
|
||||||
warning_high,
|
warning_high,
|
||||||
|
|||||||
@@ -1,13 +1,9 @@
|
|||||||
use crate::metrics::Metric;
|
use crate::agent_data::AgentData;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
/// Message sent from agent to dashboard via ZMQ
|
/// Message sent from agent to dashboard via ZMQ
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
/// Always structured data - no legacy metrics support
|
||||||
pub struct MetricMessage {
|
pub type AgentMessage = AgentData;
|
||||||
pub hostname: String,
|
|
||||||
pub timestamp: u64,
|
|
||||||
pub metrics: Vec<Metric>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Command output streaming message
|
/// Command output streaming message
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
@@ -20,15 +16,6 @@ pub struct CommandOutputMessage {
|
|||||||
pub timestamp: u64,
|
pub timestamp: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MetricMessage {
|
|
||||||
pub fn new(hostname: String, metrics: Vec<Metric>) -> Self {
|
|
||||||
Self {
|
|
||||||
hostname,
|
|
||||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
|
||||||
metrics,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl CommandOutputMessage {
|
impl CommandOutputMessage {
|
||||||
pub fn new(hostname: String, command_id: String, command_type: String, output_line: String, is_complete: bool) -> Self {
|
pub fn new(hostname: String, command_id: String, command_type: String, output_line: String, is_complete: bool) -> Self {
|
||||||
@@ -59,8 +46,8 @@ pub enum Command {
|
|||||||
pub enum CommandResponse {
|
pub enum CommandResponse {
|
||||||
/// Acknowledgment of command
|
/// Acknowledgment of command
|
||||||
Ack,
|
Ack,
|
||||||
/// Metrics response
|
/// Agent data response
|
||||||
Metrics(Vec<Metric>),
|
AgentData(AgentData),
|
||||||
/// Pong response to ping
|
/// Pong response to ping
|
||||||
Pong,
|
Pong,
|
||||||
/// Error response
|
/// Error response
|
||||||
@@ -76,7 +63,7 @@ pub struct MessageEnvelope {
|
|||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
pub enum MessageType {
|
pub enum MessageType {
|
||||||
Metrics,
|
AgentData,
|
||||||
Command,
|
Command,
|
||||||
CommandResponse,
|
CommandResponse,
|
||||||
CommandOutput,
|
CommandOutput,
|
||||||
@@ -84,10 +71,10 @@ pub enum MessageType {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl MessageEnvelope {
|
impl MessageEnvelope {
|
||||||
pub fn metrics(message: MetricMessage) -> Result<Self, crate::SharedError> {
|
pub fn agent_data(data: AgentData) -> Result<Self, crate::SharedError> {
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
message_type: MessageType::Metrics,
|
message_type: MessageType::AgentData,
|
||||||
payload: serde_json::to_vec(&message)?,
|
payload: serde_json::to_vec(&data)?,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -119,11 +106,11 @@ impl MessageEnvelope {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn decode_metrics(&self) -> Result<MetricMessage, crate::SharedError> {
|
pub fn decode_agent_data(&self) -> Result<AgentData, crate::SharedError> {
|
||||||
match self.message_type {
|
match self.message_type {
|
||||||
MessageType::Metrics => Ok(serde_json::from_slice(&self.payload)?),
|
MessageType::AgentData => Ok(serde_json::from_slice(&self.payload)?),
|
||||||
_ => Err(crate::SharedError::Protocol {
|
_ => Err(crate::SharedError::Protocol {
|
||||||
message: "Expected metrics message".to_string(),
|
message: "Expected agent data message".to_string(),
|
||||||
}),
|
}),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user