Compare commits
299 Commits
d349e2742d
...
v0.1.184
| Author | SHA1 | Date | |
|---|---|---|---|
| 7a68da01f5 | |||
| 5be67fed64 | |||
| cac836601b | |||
| bd22ce265b | |||
| bbc8b7b1cb | |||
| 5dd8cadef3 | |||
| fefe30ec51 | |||
| fb40cce748 | |||
| eaa057b284 | |||
| f23a1b5cec | |||
| 3f98f68b51 | |||
| 3d38a7a984 | |||
| b0ee0242bd | |||
| 8f9e9eabca | |||
| 937f4ad427 | |||
| 8aefab83ae | |||
| 748a9f3a3b | |||
| 5c6b11c794 | |||
| 9f0aa5f806 | |||
| fc247bd0ad | |||
| 00fe8c28ab | |||
| fbbb4a4cfb | |||
| 53e1d8bbce | |||
| 1b9fecea98 | |||
| b7ffeaced5 | |||
| 3858309a5d | |||
| df104bf940 | |||
| d5ce36ee18 | |||
| 4f80701671 | |||
| 267654fda4 | |||
| dc1105eefe | |||
| c9d12793ef | |||
| 8f80015273 | |||
| 7a95a9d762 | |||
| 7b11db990c | |||
| 67b59e9551 | |||
| da37e28b6a | |||
| d89b3ac881 | |||
| 7f26991609 | |||
| 75ec190b93 | |||
| eb892096d9 | |||
| c006625a3f | |||
| dcd5fff8c1 | |||
| 9357e5f2a8 | |||
| d164c1da5f | |||
| b120f95f8a | |||
| 66ab7a492d | |||
| 4d615a7f45 | |||
| fd7ad23205 | |||
| 2b2cb2da3e | |||
| 11d1c2dc94 | |||
| bea2d120b5 | |||
| 5394164123 | |||
| 4329cd26e0 | |||
| b85bd6b153 | |||
| c9b2d5e342 | |||
| b2b301332f | |||
| adf3b0f51c | |||
| 41ded0170c | |||
| 9b4191b2c3 | |||
| 53dbb43352 | |||
| ba03623110 | |||
| f24c4ed650 | |||
| 86501fd486 | |||
| 192eea6e0c | |||
| 43fb838c9b | |||
| 54483653f9 | |||
| e47803b705 | |||
| 439d0d9af6 | |||
| 2242b5ddfe | |||
| 9d0f42d55c | |||
| 1da7b5f6e7 | |||
| 006f27f7d9 | |||
| 07422cd0a7 | |||
| de30b80219 | |||
| 7d96ca9fad | |||
| 9b940ebd19 | |||
| 6d4da1b7da | |||
| 1e7f1616aa | |||
| 7a3ee3d5ba | |||
| 0e8b149718 | |||
| 2c27d0e1db | |||
| 9f18488752 | |||
| fab6404cca | |||
| c3626cc362 | |||
| d68ecfbc64 | |||
| d1272a6c13 | |||
| 33b3beb342 | |||
| f9384d9df6 | |||
| 156d707377 | |||
| dc1a2e3a0f | |||
| 5d6b8e6253 | |||
| 0cba083305 | |||
| a6be7a4788 | |||
| 2384f7f9b9 | |||
| cd5ef65d3d | |||
| 7bf9ca6201 | |||
| f587b42797 | |||
| 7ae464e172 | |||
| 980c9a20a2 | |||
| 448a38dede | |||
| f12e20b0f3 | |||
| 564d1f37e7 | |||
| 65bfb9f617 | |||
| 4f4ef6259b | |||
| 505263cec6 | |||
| 61dd686fb9 | |||
| c0f7a97a6f | |||
| 9575077045 | |||
| 34a1f7b9dc | |||
| d11aa11f99 | |||
| 0ca06d2507 | |||
| 6693f3a05f | |||
| de252d27b9 | |||
| db0e41a7d3 | |||
| ec460496d8 | |||
| 33e700529e | |||
| d644b7d40a | |||
| f635ba9c75 | |||
| 76b6e3373e | |||
| 0a13cab897 | |||
| d33ec5d225 | |||
| d31c2384df | |||
| c8db463204 | |||
| e8e50ef9bb | |||
| 0faed9309e | |||
| c980346d05 | |||
| 3e3d3f0c2b | |||
| 9eb7444d56 | |||
| 278d1763aa | |||
| f874264e13 | |||
| 5f6e47ece5 | |||
| 0e7cf24dbb | |||
| 2d080a2f51 | |||
| 6179bd51a7 | |||
| 57de4c366a | |||
| e18778e962 | |||
| e4469a0ebf | |||
| 6fedf4c7fc | |||
| 3f6dffa66e | |||
| 1b64fbde3d | |||
| 4f4c3b0d6e | |||
| bd20f0cae1 | |||
| 11c9a5f9d2 | |||
| aeae60146d | |||
| a82c81e8e3 | |||
| c56e9d7be2 | |||
| c8f800a1e5 | |||
| fc6b3424cf | |||
| 35e06c6734 | |||
| 783d233319 | |||
| 6509a2b91a | |||
| 52f8c40b86 | |||
| a86b5ba8f9 | |||
| 1b964545be | |||
| 97aa1708c2 | |||
| d12689f3b5 | |||
| f22e3ee95e | |||
| e890c5e810 | |||
| 078c30a592 | |||
| a847674004 | |||
| 2618f6b62f | |||
| c3fc5a181d | |||
| 3f45a172b3 | |||
| 5b12c12228 | |||
| 651b801de3 | |||
| 71b9f93d7c | |||
| ae70946c61 | |||
| 2910b7d875 | |||
| 43242debce | |||
| a2519b2814 | |||
| 91f037aa3e | |||
| 627c533724 | |||
| b1bff4857b | |||
| f8a061d496 | |||
| e61a845965 | |||
| ac5d2d4db5 | |||
| 69892a2d84 | |||
| a928d73134 | |||
| af52d49194 | |||
| bc94f75328 | |||
| b6da71b7e7 | |||
| aaf7edfbce | |||
| bb72c42726 | |||
| af5f96ce2f | |||
| 8dffe18a23 | |||
| 0c544753f9 | |||
| c8e26b9bac | |||
| 60ef712fac | |||
| 1ed4666dfd | |||
| 59d260680e | |||
| 9160fac80b | |||
| 83cb43bcf1 | |||
| b310206f1f | |||
| f9bf3ce610 | |||
| 5f8c933844 | |||
| e61fd7fd76 | |||
| 64ceed6236 | |||
| 09dcd53da5 | |||
| 43196af70c | |||
| 1b3f8671c0 | |||
| 16ea853f5b | |||
| d463272cf2 | |||
| 17b5921d8d | |||
| 3d187c9220 | |||
| 4b54a59e35 | |||
| 8dd943e8f1 | |||
| fb6ee6d7ae | |||
| a7e237e2ff | |||
| c48a105c28 | |||
| 71671a8901 | |||
| f5d2ebeaec | |||
| 2d3844b5dd | |||
| 996a199050 | |||
| a991fbb942 | |||
| 7b7e323fd8 | |||
| 114ad52ae8 | |||
| 8978356c49 | |||
| b3c67f4b7f | |||
| 864cafd61f | |||
| 6a1324ba6c | |||
| ab28382d58 | |||
| 9df6106bf5 | |||
| 967244064f | |||
| 99da289183 | |||
| b0b1ea04a1 | |||
| b8afd15417 | |||
| 61287380d3 | |||
| 999e7b5db5 | |||
| c851590aaa | |||
| 6b18cdf562 | |||
| 1b46aa2f13 | |||
| 8cb5650fbb | |||
| 51375e8020 | |||
| 65479c14af | |||
| ecee256f91 | |||
| b391448d33 | |||
| 997b30a9c0 | |||
| d193b90ba1 | |||
| ad298ac70c | |||
| 9f34c67bfa | |||
| 5134c5320a | |||
| 7f5949b818 | |||
| 473f89fb57 | |||
| d0ce1726e8 | |||
| c5ec529210 | |||
| 4193a97737 | |||
| ef9c5b6cf1 | |||
| 84e21dc79a | |||
| 1e5f8d6111 | |||
| 3b1bda741b | |||
| 64af24dc40 | |||
| df036e90dc | |||
| 9e80d6b654 | |||
| 39fc9cd22f | |||
| c99e0bd8ee | |||
| 0f12438ab4 | |||
| 7607e971b8 | |||
| da6f3c3855 | |||
| 174b27f31a | |||
| dc11538ae9 | |||
| 9133e18090 | |||
| 616fad2c5d | |||
| 7bb5c1cf84 | |||
| 245e546f18 | |||
| 14aae90954 | |||
| 52d630a2e5 | |||
| b1f294cf2f | |||
| 1591565b1b | |||
| 08d3454683 | |||
| a6c2983f65 | |||
| 3d2b37b26c | |||
| a6d2a2f086 | |||
| 1315ba1315 | |||
| 0417e2c1f1 | |||
| a08670071c | |||
| 338c4457a5 | |||
| f4b5bb814d | |||
| 7ead8ee98a | |||
| 34822bd835 | |||
| 98afb19945 | |||
| d80f2ce811 | |||
| 89afd9143f | |||
| 98e3ecb0ea | |||
| 41208aa2a0 | |||
| a937032eb1 | |||
| 1e8da8c187 | |||
| 1cc31ec26a | |||
| b580cfde8c | |||
| 5886426dac | |||
| eb268922bd | |||
| 049ac53629 | |||
| 00a8ed3da2 | |||
| e998679901 | |||
| 2ccfc4256a | |||
| 11be496a26 | |||
| 66a79574e0 | |||
| ecaf3aedb5 | |||
| 959745b51b |
128
.gitea/workflows/release.yml
Normal file
128
.gitea/workflows/release.yml
Normal file
@@ -0,0 +1,128 @@
|
||||
name: Build and Release
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v*'
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
version:
|
||||
description: 'Version to release (e.g., v0.1.0)'
|
||||
required: true
|
||||
default: 'v0.1.0'
|
||||
|
||||
jobs:
|
||||
build-and-release:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Rust
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
profile: minimal
|
||||
override: true
|
||||
|
||||
- name: Install system dependencies
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get install -y pkg-config libssl-dev libzmq3-dev
|
||||
|
||||
- name: Build workspace (static)
|
||||
run: |
|
||||
export RUSTFLAGS="-C target-feature=+crt-static"
|
||||
cargo build --release --workspace --target x86_64-unknown-linux-gnu
|
||||
|
||||
- name: Create release directory
|
||||
run: |
|
||||
mkdir -p release
|
||||
cp target/x86_64-unknown-linux-gnu/release/cm-dashboard release/cm-dashboard-linux-x86_64
|
||||
cp target/x86_64-unknown-linux-gnu/release/cm-dashboard-agent release/cm-dashboard-agent-linux-x86_64
|
||||
|
||||
- name: Create tarball
|
||||
run: |
|
||||
cd release
|
||||
tar -czf cm-dashboard-linux-x86_64.tar.gz cm-dashboard-linux-x86_64 cm-dashboard-agent-linux-x86_64
|
||||
|
||||
- name: Set version variable
|
||||
id: version
|
||||
run: |
|
||||
if [ "${{ gitea.event_name }}" == "workflow_dispatch" ]; then
|
||||
echo "VERSION=${{ gitea.event.inputs.version }}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Create Release with curl
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.GITEATOKEN }}
|
||||
run: |
|
||||
VERSION="${{ steps.version.outputs.VERSION }}"
|
||||
|
||||
# Create release
|
||||
curl -X POST \
|
||||
-H "Authorization: token $GITEA_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"tag_name": "'$VERSION'",
|
||||
"name": "cm-dashboard '$VERSION'",
|
||||
"body": "## cm-dashboard '$VERSION'\n\nPre-built binaries for Linux x86_64:\n- cm-dashboard-linux-x86_64 - Dashboard TUI binary\n- cm-dashboard-agent-linux-x86_64 - Agent daemon binary\n- cm-dashboard-linux-x86_64.tar.gz - Combined tarball"
|
||||
}' \
|
||||
"https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/releases"
|
||||
|
||||
# Get release ID
|
||||
RELEASE_ID=$(curl -s -H "Authorization: token $GITEA_TOKEN" \
|
||||
"https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/releases/tags/$VERSION" | \
|
||||
grep -o '"id":[0-9]*' | head -1 | cut -d':' -f2)
|
||||
|
||||
# Upload binaries
|
||||
curl -X POST \
|
||||
-H "Authorization: token $GITEA_TOKEN" \
|
||||
-F "attachment=@release/cm-dashboard-linux-x86_64" \
|
||||
"https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/releases/$RELEASE_ID/assets?name=cm-dashboard-linux-x86_64"
|
||||
|
||||
curl -X POST \
|
||||
-H "Authorization: token $GITEA_TOKEN" \
|
||||
-F "attachment=@release/cm-dashboard-agent-linux-x86_64" \
|
||||
"https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/releases/$RELEASE_ID/assets?name=cm-dashboard-agent-linux-x86_64"
|
||||
|
||||
curl -X POST \
|
||||
-H "Authorization: token $GITEA_TOKEN" \
|
||||
-F "attachment=@release/cm-dashboard-linux-x86_64.tar.gz" \
|
||||
"https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/releases/$RELEASE_ID/assets?name=cm-dashboard-linux-x86_64.tar.gz"
|
||||
|
||||
- name: Update NixOS Configuration
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.GITEATOKEN }}
|
||||
run: |
|
||||
VERSION="${{ steps.version.outputs.VERSION }}"
|
||||
|
||||
# Clone nixosbox repository
|
||||
git clone https://$GITEA_TOKEN@gitea.cmtec.se/cm/nixosbox.git nixosbox-update
|
||||
cd nixosbox-update
|
||||
|
||||
# Get hash for the new release tarball
|
||||
TARBALL_URL="https://gitea.cmtec.se/cm/cm-dashboard/releases/download/$VERSION/cm-dashboard-linux-x86_64.tar.gz"
|
||||
|
||||
# Download tarball to get correct hash
|
||||
curl -L -o cm-dashboard.tar.gz "$TARBALL_URL"
|
||||
# Convert sha256 hex to base64 for Nix hash format using Python
|
||||
NEW_HASH=$(sha256sum cm-dashboard.tar.gz | cut -d' ' -f1)
|
||||
NIX_HASH="sha256-$(python3 -c "import base64, binascii; print(base64.b64encode(binascii.unhexlify('$NEW_HASH')).decode())")"
|
||||
|
||||
# Update the NixOS configuration
|
||||
sed -i "s|version = \"v[^\"]*\"|version = \"$VERSION\"|" services/cm-dashboard.nix
|
||||
sed -i "s|sha256 = \"sha256-[^\"]*\"|sha256 = \"$NIX_HASH\"|" services/cm-dashboard.nix
|
||||
|
||||
# Commit and push changes
|
||||
git config user.name "Gitea Actions"
|
||||
git config user.email "actions@gitea.cmtec.se"
|
||||
git add services/cm-dashboard.nix
|
||||
git commit -m "Auto-update cm-dashboard to $VERSION
|
||||
|
||||
- Update version to $VERSION with automated release
|
||||
- Update tarball hash for new static binaries
|
||||
- Automated update from cm-dashboard release workflow"
|
||||
git push
|
||||
@@ -1,3 +0,0 @@
|
||||
# Agent Guide
|
||||
|
||||
Agents working in this repo must follow the instructions in `CLAUDE.md`.
|
||||
853
ARCHITECT.md
853
ARCHITECT.md
@@ -1,853 +0,0 @@
|
||||
# CM Dashboard Agent Architecture
|
||||
|
||||
## Overview
|
||||
|
||||
This document defines the architecture for the CM Dashboard Agent. The agent collects individual metrics and sends them to the dashboard via ZMQ. The dashboard decides which metrics to use in which widgets.
|
||||
|
||||
## Core Philosophy
|
||||
|
||||
**Individual Metrics Approach**: The agent collects and transmits individual metrics (e.g., `cpu_load_1min`, `memory_usage_percent`, `backup_last_run`) rather than grouped metric structures. This provides maximum flexibility for dashboard widget composition.
|
||||
|
||||
## Folder Structure
|
||||
|
||||
```
|
||||
cm-dashboard/
|
||||
├── agent/ # Agent application
|
||||
│ ├── Cargo.toml
|
||||
│ ├── src/
|
||||
│ │ ├── main.rs # Entry point with CLI parsing
|
||||
│ │ ├── agent.rs # Main Agent orchestrator
|
||||
│ │ ├── config/
|
||||
│ │ │ ├── mod.rs # Configuration module exports
|
||||
│ │ │ ├── loader.rs # TOML configuration loading
|
||||
│ │ │ ├── defaults.rs # Default configuration values
|
||||
│ │ │ └── validation.rs # Configuration validation
|
||||
│ │ ├── communication/
|
||||
│ │ │ ├── mod.rs # Communication module exports
|
||||
│ │ │ ├── zmq_config.rs # ZMQ configuration structures
|
||||
│ │ │ ├── zmq_handler.rs # ZMQ socket management
|
||||
│ │ │ ├── protocol.rs # Message format definitions
|
||||
│ │ │ └── error.rs # Communication errors
|
||||
│ │ ├── metrics/
|
||||
│ │ │ ├── mod.rs # Metrics module exports
|
||||
│ │ │ ├── registry.rs # Metric name registry and types
|
||||
│ │ │ ├── value.rs # Metric value types and status
|
||||
│ │ │ ├── cache.rs # Individual metric caching
|
||||
│ │ │ └── collection.rs # Metric collection storage
|
||||
│ │ ├── collectors/
|
||||
│ │ │ ├── mod.rs # Collector trait definition
|
||||
│ │ │ ├── cpu.rs # CPU-related metrics
|
||||
│ │ │ ├── memory.rs # Memory-related metrics
|
||||
│ │ │ ├── disk.rs # Disk usage metrics
|
||||
│ │ │ ├── processes.rs # Process-related metrics
|
||||
│ │ │ ├── systemd.rs # Systemd service metrics
|
||||
│ │ │ ├── smart.rs # Storage SMART metrics
|
||||
│ │ │ ├── backup.rs # Backup status metrics
|
||||
│ │ │ ├── network.rs # Network metrics
|
||||
│ │ │ └── error.rs # Collector errors
|
||||
│ │ ├── notifications/
|
||||
│ │ │ ├── mod.rs # Notification exports
|
||||
│ │ │ ├── manager.rs # Status change detection
|
||||
│ │ │ ├── email.rs # Email notification backend
|
||||
│ │ │ └── status_tracker.rs # Individual metric status tracking
|
||||
│ │ └── utils/
|
||||
│ │ ├── mod.rs # Utility exports
|
||||
│ │ ├── system.rs # System command utilities
|
||||
│ │ ├── time.rs # Timestamp utilities
|
||||
│ │ └── discovery.rs # Auto-discovery functions
|
||||
│ ├── config/
|
||||
│ │ ├── agent.example.toml # Example configuration
|
||||
│ │ └── production.toml # Production template
|
||||
│ └── tests/
|
||||
│ ├── integration/ # Integration tests
|
||||
│ ├── unit/ # Unit tests by module
|
||||
│ └── fixtures/ # Test data and mocks
|
||||
├── dashboard/ # Dashboard application
|
||||
│ ├── Cargo.toml
|
||||
│ ├── src/
|
||||
│ │ ├── main.rs # Entry point with CLI parsing
|
||||
│ │ ├── app.rs # Main Dashboard application state
|
||||
│ │ ├── config/
|
||||
│ │ │ ├── mod.rs # Configuration module exports
|
||||
│ │ │ ├── loader.rs # TOML configuration loading
|
||||
│ │ │ └── defaults.rs # Default configuration values
|
||||
│ │ ├── communication/
|
||||
│ │ │ ├── mod.rs # Communication module exports
|
||||
│ │ │ ├── zmq_consumer.rs # ZMQ metric consumer
|
||||
│ │ │ ├── protocol.rs # Shared message protocol
|
||||
│ │ │ └── error.rs # Communication errors
|
||||
│ │ ├── metrics/
|
||||
│ │ │ ├── mod.rs # Metrics module exports
|
||||
│ │ │ ├── store.rs # Metric storage and retrieval
|
||||
│ │ │ ├── filter.rs # Metric filtering and selection
|
||||
│ │ │ ├── history.rs # Historical metric storage
|
||||
│ │ │ └── subscription.rs # Metric subscription management
|
||||
│ │ ├── ui/
|
||||
│ │ │ ├── mod.rs # UI module exports
|
||||
│ │ │ ├── app.rs # Main UI application loop
|
||||
│ │ │ ├── layout.rs # Layout management
|
||||
│ │ │ ├── widgets/
|
||||
│ │ │ │ ├── mod.rs # Widget exports
|
||||
│ │ │ │ ├── base.rs # Base widget trait
|
||||
│ │ │ │ ├── cpu.rs # CPU metrics widget
|
||||
│ │ │ │ ├── memory.rs # Memory metrics widget
|
||||
│ │ │ │ ├── storage.rs # Storage metrics widget
|
||||
│ │ │ │ ├── services.rs # Services metrics widget
|
||||
│ │ │ │ ├── backup.rs # Backup metrics widget
|
||||
│ │ │ │ ├── hosts.rs # Host selection widget
|
||||
│ │ │ │ └── alerts.rs # Alerts/status widget
|
||||
│ │ │ ├── theme.rs # UI theming and colors
|
||||
│ │ │ └── input.rs # Input handling
|
||||
│ │ ├── hosts/
|
||||
│ │ │ ├── mod.rs # Host management exports
|
||||
│ │ │ ├── manager.rs # Host connection management
|
||||
│ │ │ ├── discovery.rs # Host auto-discovery
|
||||
│ │ │ └── connection.rs # Individual host connections
|
||||
│ │ └── utils/
|
||||
│ │ ├── mod.rs # Utility exports
|
||||
│ │ ├── formatting.rs # Data formatting utilities
|
||||
│ │ └── time.rs # Time formatting utilities
|
||||
│ ├── config/
|
||||
│ │ ├── dashboard.example.toml # Example configuration
|
||||
│ │ └── hosts.example.toml # Example host configuration
|
||||
│ └── tests/
|
||||
│ ├── integration/ # Integration tests
|
||||
│ ├── unit/ # Unit tests by module
|
||||
│ └── fixtures/ # Test data and mocks
|
||||
├── shared/ # Shared types and utilities
|
||||
│ ├── Cargo.toml
|
||||
│ ├── src/
|
||||
│ │ ├── lib.rs # Shared library exports
|
||||
│ │ ├── protocol.rs # Shared message protocol
|
||||
│ │ ├── metrics.rs # Shared metric types
|
||||
│ │ └── error.rs # Shared error types
|
||||
└── tests/ # End-to-end tests
|
||||
├── e2e/ # End-to-end test scenarios
|
||||
└── fixtures/ # Shared test data
|
||||
```
|
||||
|
||||
## Architecture Principles
|
||||
|
||||
### 1. Individual Metrics Philosophy
|
||||
|
||||
**No Grouped Structures**: Instead of `SystemMetrics` or `BackupMetrics`, we collect individual metrics:
|
||||
|
||||
```rust
|
||||
// Good - Individual metrics
|
||||
"cpu_load_1min" -> 2.5
|
||||
"cpu_load_5min" -> 2.8
|
||||
"cpu_temperature" -> 45.0
|
||||
"memory_usage_percent" -> 78.5
|
||||
"memory_total_gb" -> 32.0
|
||||
"disk_root_usage_percent" -> 15.2
|
||||
"service_ssh_status" -> "active"
|
||||
"backup_last_run_timestamp" -> 1697123456
|
||||
|
||||
// Bad - Grouped structures
|
||||
SystemMetrics { cpu: {...}, memory: {...} }
|
||||
```
|
||||
|
||||
**Dashboard Flexibility**: The dashboard consumes individual metrics and decides which ones to display in each widget.
|
||||
|
||||
### 2. Metric Definition
|
||||
|
||||
Each metric has:
|
||||
- **Name**: Unique identifier (e.g., `cpu_load_1min`)
|
||||
- **Value**: Typed value (f32, i64, String, bool)
|
||||
- **Status**: Health status (ok, warning, critical, unknown)
|
||||
- **Timestamp**: When the metric was collected
|
||||
- **Metadata**: Optional description, units, etc.
|
||||
|
||||
### 3. Module Responsibilities
|
||||
|
||||
- **Communication**: ZMQ protocol and message handling
|
||||
- **Metrics**: Value types, caching, and storage
|
||||
- **Collectors**: Gather specific metrics from system
|
||||
- **Notifications**: Track status changes across all metrics
|
||||
- **Config**: Configuration loading and validation
|
||||
|
||||
### 4. Data Flow
|
||||
|
||||
```
|
||||
Collectors → Individual Metrics → Cache → ZMQ → Dashboard
|
||||
    ↓                  ↓                ↓
|
||||
Status Calc → Status Tracker → Notifications
|
||||
```
|
||||
|
||||
## Metric Design Rules
|
||||
|
||||
### 1. Naming Convention
|
||||
|
||||
Metrics follow hierarchical naming:
|
||||
|
||||
```
|
||||
{category}_{subcategory}_{property}_{unit}
|
||||
|
||||
Examples:
|
||||
cpu_load_1min
|
||||
cpu_temperature_celsius
|
||||
memory_usage_percent
|
||||
memory_total_gb
|
||||
disk_root_usage_percent
|
||||
disk_nvme0_temperature_celsius
|
||||
service_ssh_status
|
||||
service_ssh_memory_mb
|
||||
backup_last_run_timestamp
|
||||
backup_status
|
||||
network_eth0_rx_bytes
|
||||
```
|
||||
|
||||
### 2. Value Types
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum MetricValue {
|
||||
Float(f32),
|
||||
Integer(i64),
|
||||
String(String),
|
||||
Boolean(bool),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
|
||||
pub enum Status {
|
||||
Ok,
|
||||
Warning,
|
||||
Critical,
|
||||
Unknown,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Metric {
|
||||
pub name: String,
|
||||
pub value: MetricValue,
|
||||
pub status: Status,
|
||||
pub timestamp: u64,
|
||||
pub description: Option<String>,
|
||||
pub unit: Option<String>,
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Collector Interface
|
||||
|
||||
Each collector provides individual metrics:
|
||||
|
||||
```rust
|
||||
#[async_trait]
|
||||
pub trait Collector {
|
||||
fn name(&self) -> &str;
|
||||
async fn collect(&self) -> Result<Vec<Metric>>;
|
||||
}
|
||||
|
||||
// Example CPU collector output:
|
||||
vec![
|
||||
Metric { name: "cpu_load_1min", value: Float(2.5), status: Ok, ... },
|
||||
Metric { name: "cpu_load_5min", value: Float(2.8), status: Ok, ... },
|
||||
Metric { name: "cpu_temperature", value: Float(45.0), status: Ok, ... },
|
||||
]
|
||||
```
|
||||
|
||||
## Communication Protocol
|
||||
|
||||
### ZMQ Message Format
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct MetricMessage {
|
||||
pub hostname: String,
|
||||
pub timestamp: u64,
|
||||
pub metrics: Vec<Metric>,
|
||||
}
|
||||
```
|
||||
|
||||
### ZMQ Configuration
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct ZmqConfig {
|
||||
pub publisher_port: u16, // Default: 6130
|
||||
pub command_port: u16, // Default: 6131
|
||||
pub bind_address: String, // Default: "0.0.0.0"
|
||||
pub timeout_ms: u64, // Default: 5000
|
||||
pub heartbeat_interval: u64, // Default: 30000
|
||||
}
|
||||
```
|
||||
|
||||
## Caching Strategy
|
||||
|
||||
### Configuration-Based Individual Metric Cache
|
||||
|
||||
```rust
|
||||
pub struct MetricCache {
|
||||
cache: HashMap<String, CachedMetric>,
|
||||
config: CacheConfig,
|
||||
}
|
||||
|
||||
struct CachedMetric {
|
||||
metric: Metric,
|
||||
collected_at: Instant,
|
||||
access_count: u64,
|
||||
cache_tier: CacheTier,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct CacheConfig {
|
||||
pub enabled: bool,
|
||||
pub default_ttl_seconds: u64,
|
||||
pub max_entries: usize,
|
||||
pub metric_tiers: HashMap<String, CacheTier>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct CacheTier {
|
||||
pub interval_seconds: u64,
|
||||
pub description: String,
|
||||
}
|
||||
```
|
||||
|
||||
**Configuration-Based Caching Rules**:
|
||||
- Each metric type has configurable cache intervals via config files
|
||||
- Cache tiers defined in configuration, not hardcoded
|
||||
- Individual metrics cached by name with tier-specific TTL
|
||||
- Cache miss triggers single metric collection
|
||||
- No grouped cache invalidation
|
||||
- Performance target: <2% CPU usage through intelligent caching
|
||||
|
||||
## Configuration System
|
||||
|
||||
### Configuration Structure
|
||||
|
||||
```toml
|
||||
[zmq]
|
||||
publisher_port = 6130
|
||||
command_port = 6131
|
||||
bind_address = "0.0.0.0"
|
||||
timeout_ms = 5000
|
||||
|
||||
[cache]
|
||||
enabled = true
|
||||
default_ttl_seconds = 30
|
||||
max_entries = 10000
|
||||
|
||||
# Cache tiers for different metric types
|
||||
[cache.tiers.realtime]
|
||||
interval_seconds = 5
|
||||
description = "High-frequency metrics (CPU load, memory usage)"
|
||||
|
||||
[cache.tiers.fast]
|
||||
interval_seconds = 30
|
||||
description = "Medium-frequency metrics (network stats, process lists)"
|
||||
|
||||
[cache.tiers.medium]
|
||||
interval_seconds = 300
|
||||
description = "Low-frequency metrics (service status, disk usage)"
|
||||
|
||||
[cache.tiers.slow]
|
||||
interval_seconds = 900
|
||||
description = "Very low-frequency metrics (SMART data, backup status)"
|
||||
|
||||
[cache.tiers.static]
|
||||
interval_seconds = 3600
|
||||
description = "Rarely changing metrics (hardware info, system capabilities)"
|
||||
|
||||
# Metric type to tier mapping
|
||||
[cache.metric_assignments]
|
||||
"cpu_load_*" = "realtime"
|
||||
"memory_usage_*" = "realtime"
|
||||
"service_*_cpu_percent" = "realtime"
|
||||
"service_*_memory_mb" = "realtime"
|
||||
"service_*_status" = "medium"
|
||||
"service_*_disk_gb" = "medium"
|
||||
"disk_*_temperature" = "slow"
|
||||
"disk_*_wear_percent" = "slow"
|
||||
"backup_*" = "slow"
|
||||
"network_*" = "fast"
|
||||
|
||||
[collectors.cpu]
|
||||
enabled = true
|
||||
interval_seconds = 5
|
||||
temperature_warning = 70.0
|
||||
temperature_critical = 80.0
|
||||
load_warning = 5.0
|
||||
load_critical = 8.0
|
||||
|
||||
[collectors.memory]
|
||||
enabled = true
|
||||
interval_seconds = 5
|
||||
usage_warning_percent = 80.0
|
||||
usage_critical_percent = 95.0
|
||||
|
||||
[collectors.systemd]
|
||||
enabled = true
|
||||
interval_seconds = 30
|
||||
services = ["ssh", "nginx", "docker", "gitea"]
|
||||
|
||||
[notifications]
|
||||
enabled = true
|
||||
smtp_host = "localhost"
|
||||
smtp_port = 25
|
||||
from_email = "{{hostname}}@cmtec.se"
|
||||
to_email = "cm@cmtec.se"
|
||||
rate_limit_minutes = 30
|
||||
```
|
||||
|
||||
## Implementation Guidelines
|
||||
|
||||
### 1. Adding New Metrics
|
||||
|
||||
```rust
|
||||
// 1. Define metric names in registry
|
||||
pub const NETWORK_ETH0_RX_BYTES: &str = "network_eth0_rx_bytes";
|
||||
pub const NETWORK_ETH0_TX_BYTES: &str = "network_eth0_tx_bytes";
|
||||
|
||||
// 2. Implement collector
|
||||
pub struct NetworkCollector {
|
||||
config: NetworkConfig,
|
||||
}
|
||||
|
||||
impl Collector for NetworkCollector {
|
||||
async fn collect(&self) -> Result<Vec<Metric>> {
|
||||
vec![
|
||||
Metric {
|
||||
name: NETWORK_ETH0_RX_BYTES.to_string(),
|
||||
value: MetricValue::Integer(rx_bytes),
|
||||
status: Status::Ok,
|
||||
timestamp: now(),
|
||||
unit: Some("bytes".to_string()),
|
||||
..Default::default()
|
||||
},
|
||||
// ... more metrics
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Register in agent
|
||||
agent.register_collector(Box::new(NetworkCollector::new(config.network)));
|
||||
```
|
||||
|
||||
### 2. Status Calculation
|
||||
|
||||
Each collector calculates status for its metrics:
|
||||
|
||||
```rust
|
||||
impl CpuCollector {
|
||||
fn calculate_temperature_status(&self, temp: f32) -> Status {
|
||||
if temp >= self.config.critical_threshold {
|
||||
Status::Critical
|
||||
} else if temp >= self.config.warning_threshold {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Dashboard Usage
|
||||
|
||||
Dashboard widgets subscribe to specific metrics:
|
||||
|
||||
```rust
|
||||
// Dashboard CPU widget
|
||||
let cpu_metrics = [
|
||||
"cpu_load_1min",
|
||||
"cpu_load_5min",
|
||||
"cpu_load_15min",
|
||||
"cpu_temperature",
|
||||
];
|
||||
|
||||
// Dashboard memory widget
|
||||
let memory_metrics = [
|
||||
"memory_usage_percent",
|
||||
"memory_total_gb",
|
||||
"memory_available_gb",
|
||||
];
|
||||
```
|
||||
|
||||
# Dashboard Architecture
|
||||
|
||||
## Dashboard Principles
|
||||
|
||||
### 1. UI Layout Preservation
|
||||
|
||||
**Current UI Layout Maintained**: The existing dashboard UI layout is preserved and enhanced with the new metric-centric architecture. All current widgets remain in their established positions and functionality.
|
||||
|
||||
**Widget Enhancement, Not Replacement**: Widgets are enhanced to consume individual metrics rather than grouped structures, but maintain their visual appearance and user interaction patterns.
|
||||
|
||||
### 2. Metric-to-Widget Mapping
|
||||
|
||||
Each widget subscribes to specific individual metrics and composes them for display:
|
||||
|
||||
```rust
|
||||
// CPU Widget Metrics
|
||||
const CPU_WIDGET_METRICS: &[&str] = &[
|
||||
"cpu_load_1min",
|
||||
"cpu_load_5min",
|
||||
"cpu_load_15min",
|
||||
"cpu_temperature_celsius",
|
||||
"cpu_frequency_mhz",
|
||||
"cpu_usage_percent",
|
||||
];
|
||||
|
||||
// Memory Widget Metrics
|
||||
const MEMORY_WIDGET_METRICS: &[&str] = &[
|
||||
"memory_usage_percent",
|
||||
"memory_total_gb",
|
||||
"memory_available_gb",
|
||||
"memory_used_gb",
|
||||
"memory_swap_total_gb",
|
||||
"memory_swap_used_gb",
|
||||
];
|
||||
|
||||
// Storage Widget Metrics
|
||||
const STORAGE_WIDGET_METRICS: &[&str] = &[
|
||||
"disk_nvme0_temperature_celsius",
|
||||
"disk_nvme0_wear_percent",
|
||||
"disk_nvme0_spare_percent",
|
||||
"disk_nvme0_hours",
|
||||
"disk_nvme0_capacity_gb",
|
||||
"disk_nvme0_usage_gb",
|
||||
"disk_nvme0_usage_percent",
|
||||
];
|
||||
|
||||
// Services Widget Metrics
|
||||
const SERVICES_WIDGET_METRICS: &[&str] = &[
|
||||
"service_ssh_status",
|
||||
"service_ssh_memory_mb",
|
||||
"service_ssh_cpu_percent",
|
||||
"service_nginx_status",
|
||||
"service_nginx_memory_mb",
|
||||
"service_docker_status",
|
||||
// ... per discovered service
|
||||
];
|
||||
|
||||
// Backup Widget Metrics
|
||||
const BACKUP_WIDGET_METRICS: &[&str] = &[
|
||||
"backup_last_run_timestamp",
|
||||
"backup_status",
|
||||
"backup_size_gb",
|
||||
"backup_duration_minutes",
|
||||
"backup_next_scheduled_timestamp",
|
||||
];
|
||||
```
|
||||
|
||||
## Dashboard Communication
|
||||
|
||||
### ZMQ Consumer Architecture
|
||||
|
||||
```rust
|
||||
// dashboard/src/communication/zmq_consumer.rs
|
||||
pub struct ZmqConsumer {
|
||||
subscriber: Socket,
|
||||
config: ZmqConfig,
|
||||
metric_filter: MetricFilter,
|
||||
}
|
||||
|
||||
impl ZmqConsumer {
|
||||
pub async fn subscribe_to_host(&mut self, hostname: &str) -> Result<()>
|
||||
pub async fn receive_metrics(&mut self) -> Result<Vec<Metric>>
|
||||
pub fn set_metric_filter(&mut self, filter: MetricFilter)
|
||||
pub async fn request_metrics(&self, metric_names: &[String]) -> Result<()>
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MetricFilter {
|
||||
pub include_patterns: Vec<String>,
|
||||
pub exclude_patterns: Vec<String>,
|
||||
pub hosts: Vec<String>,
|
||||
}
|
||||
```
|
||||
|
||||
### Protocol Compatibility
|
||||
|
||||
The dashboard uses the same protocol as defined in the agent:
|
||||
|
||||
```rust
|
||||
// shared/src/protocol.rs (shared between agent and dashboard)
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct MetricMessage {
|
||||
pub hostname: String,
|
||||
pub timestamp: u64,
|
||||
pub metrics: Vec<Metric>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Metric {
|
||||
pub name: String,
|
||||
pub value: MetricValue,
|
||||
pub status: Status,
|
||||
pub timestamp: u64,
|
||||
pub description: Option<String>,
|
||||
pub unit: Option<String>,
|
||||
}
|
||||
```
|
||||
|
||||
## Dashboard Metric Management
|
||||
|
||||
### Metric Store
|
||||
|
||||
```rust
|
||||
// dashboard/src/metrics/store.rs
|
||||
pub struct MetricStore {
|
||||
current_metrics: HashMap<String, HashMap<String, Metric>>, // host -> metric_name -> metric
|
||||
historical_metrics: HistoricalStore,
|
||||
subscriptions: SubscriptionManager,
|
||||
}
|
||||
|
||||
impl MetricStore {
|
||||
pub fn update_metrics(&mut self, hostname: &str, metrics: Vec<Metric>)
|
||||
pub fn get_metric(&self, hostname: &str, metric_name: &str) -> Option<&Metric>
|
||||
pub fn get_metrics_for_widget(&self, hostname: &str, widget: WidgetType) -> Vec<&Metric>
|
||||
pub fn get_hosts(&self) -> Vec<String>
|
||||
pub fn get_latest_timestamp(&self, hostname: &str) -> Option<u64>
|
||||
}
|
||||
```
|
||||
|
||||
### Metric Subscription Management
|
||||
|
||||
```rust
|
||||
// dashboard/src/metrics/subscription.rs
|
||||
pub struct SubscriptionManager {
|
||||
widget_subscriptions: HashMap<WidgetType, Vec<String>>,
|
||||
active_hosts: HashSet<String>,
|
||||
metric_filters: HashMap<String, MetricFilter>,
|
||||
}
|
||||
|
||||
impl SubscriptionManager {
|
||||
pub fn subscribe_widget(&mut self, widget: WidgetType, metrics: &[String])
|
||||
pub fn get_required_metrics(&self) -> Vec<String>
|
||||
pub fn add_host(&mut self, hostname: String)
|
||||
pub fn remove_host(&mut self, hostname: &str)
|
||||
pub fn is_metric_needed(&self, metric_name: &str) -> bool
|
||||
}
|
||||
```
|
||||
|
||||
## Widget Architecture
|
||||
|
||||
### Base Widget Trait
|
||||
|
||||
```rust
|
||||
// dashboard/src/ui/widgets/base.rs
|
||||
pub trait Widget {
|
||||
fn widget_type(&self) -> WidgetType;
|
||||
fn required_metrics(&self) -> &[&str];
|
||||
fn update_metrics(&mut self, metrics: &HashMap<String, Metric>);
|
||||
fn render(&self, frame: &mut Frame, area: Rect);
|
||||
fn handle_input(&mut self, event: &Event) -> bool;
|
||||
fn get_status(&self) -> Status;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)]
|
||||
pub enum WidgetType {
|
||||
Cpu,
|
||||
Memory,
|
||||
Storage,
|
||||
Services,
|
||||
Backup,
|
||||
Hosts,
|
||||
Alerts,
|
||||
}
|
||||
```
|
||||
|
||||
### Enhanced Widget Implementation
|
||||
|
||||
```rust
|
||||
// dashboard/src/ui/widgets/cpu.rs
|
||||
pub struct CpuWidget {
|
||||
metrics: HashMap<String, Metric>,
|
||||
config: CpuWidgetConfig,
|
||||
}
|
||||
|
||||
impl Widget for CpuWidget {
|
||||
fn required_metrics(&self) -> &[&str] {
|
||||
CPU_WIDGET_METRICS
|
||||
}
|
||||
|
||||
fn update_metrics(&mut self, metrics: &HashMap<String, Metric>) {
|
||||
// Update only the metrics this widget cares about
|
||||
for &metric_name in self.required_metrics() {
|
||||
if let Some(metric) = metrics.get(metric_name) {
|
||||
self.metrics.insert(metric_name.to_string(), metric.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn render(&self, frame: &mut Frame, area: Rect) {
|
||||
// Extract specific metric values for display
|
||||
let load_1min = self.get_metric_value("cpu_load_1min").unwrap_or(0.0);
|
||||
let load_5min = self.get_metric_value("cpu_load_5min").unwrap_or(0.0);
|
||||
let temperature = self.get_metric_value("cpu_temperature_celsius");
|
||||
|
||||
// Maintain existing UI layout and styling
|
||||
// ... render implementation preserving current appearance
|
||||
}
|
||||
|
||||
fn get_status(&self) -> Status {
|
||||
// Aggregate status from individual metric statuses
|
||||
self.metrics.values()
|
||||
.map(|m| &m.status)
|
||||
.max()
|
||||
.copied()
|
||||
.unwrap_or(Status::Unknown)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Host Management
|
||||
|
||||
### Multi-Host Connection Management
|
||||
|
||||
```rust
|
||||
// dashboard/src/hosts/manager.rs
|
||||
pub struct HostManager {
|
||||
connections: HashMap<String, HostConnection>,
|
||||
discovery: HostDiscovery,
|
||||
active_host: Option<String>,
|
||||
metric_store: Arc<Mutex<MetricStore>>,
|
||||
}
|
||||
|
||||
impl HostManager {
|
||||
pub async fn discover_hosts(&mut self) -> Result<Vec<String>>
|
||||
pub async fn connect_to_host(&mut self, hostname: &str) -> Result<()>
|
||||
pub fn disconnect_from_host(&mut self, hostname: &str)
|
||||
pub fn set_active_host(&mut self, hostname: String)
|
||||
pub fn get_active_host(&self) -> Option<&str>
|
||||
pub fn get_connected_hosts(&self) -> Vec<&str>
|
||||
pub async fn refresh_all_hosts(&mut self) -> Result<()>
|
||||
}
|
||||
|
||||
// dashboard/src/hosts/connection.rs
|
||||
pub struct HostConnection {
|
||||
hostname: String,
|
||||
zmq_consumer: ZmqConsumer,
|
||||
last_seen: Instant,
|
||||
connection_status: ConnectionStatus,
|
||||
metric_buffer: VecDeque<Metric>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum ConnectionStatus {
|
||||
Connected,
|
||||
Connecting,
|
||||
Disconnected,
|
||||
Error(String),
|
||||
}
|
||||
```
|
||||
|
||||
## Configuration Integration
|
||||
|
||||
### Dashboard Configuration
|
||||
|
||||
```toml
|
||||
# dashboard/config/dashboard.toml
|
||||
[zmq]
|
||||
subscriber_ports = [6130] # Ports to listen on for metrics
|
||||
connection_timeout_ms = 15000
|
||||
reconnect_interval_ms = 5000
|
||||
|
||||
[ui]
|
||||
refresh_rate_ms = 100
|
||||
theme = "default"
|
||||
preserve_layout = true
|
||||
|
||||
[hosts]
|
||||
auto_discovery = true
|
||||
predefined_hosts = ["cmbox", "labbox", "simonbox", "steambox", "srv01"]
|
||||
default_host = "cmbox"
|
||||
|
||||
[metrics]
|
||||
history_retention_hours = 24
|
||||
max_metrics_per_host = 10000
|
||||
|
||||
[widgets.cpu]
|
||||
enabled = true
|
||||
metrics = [
|
||||
"cpu_load_1min",
|
||||
"cpu_load_5min",
|
||||
"cpu_load_15min",
|
||||
"cpu_temperature_celsius"
|
||||
]
|
||||
|
||||
[widgets.memory]
|
||||
enabled = true
|
||||
metrics = [
|
||||
"memory_usage_percent",
|
||||
"memory_total_gb",
|
||||
"memory_available_gb"
|
||||
]
|
||||
|
||||
[widgets.storage]
|
||||
enabled = true
|
||||
metrics = [
|
||||
"disk_nvme0_temperature_celsius",
|
||||
"disk_nvme0_wear_percent",
|
||||
"disk_nvme0_usage_percent"
|
||||
]
|
||||
```
|
||||
|
||||
## UI Layout Preservation Rules
|
||||
|
||||
### 1. Maintain Current Widget Positions
|
||||
|
||||
- **CPU widget**: Top-left position preserved
|
||||
- **Memory widget**: Top-right position preserved
|
||||
- **Storage widget**: Left-center position preserved
|
||||
- **Services widget**: Right-center position preserved
|
||||
- **Backup widget**: Bottom-right position preserved
|
||||
- **Host navigation**: Bottom status bar preserved
|
||||
|
||||
### 2. Preserve Visual Styling
|
||||
|
||||
- **Colors**: Existing status colors (green, yellow, red) maintained
|
||||
- **Borders**: Current border styles and characters preserved
|
||||
- **Text formatting**: Font styles, alignment, and spacing preserved
|
||||
- **Progress bars**: Current progress bar implementations maintained
|
||||
|
||||
### 3. Maintain User Interactions
|
||||
|
||||
- **Navigation keys**: `←→` for host switching preserved
|
||||
- **Refresh key**: `r` for manual refresh preserved
|
||||
- **Quit key**: `q` for exit preserved
|
||||
- **Additional keys**: All current keyboard shortcuts maintained
|
||||
|
||||
### 4. Status Display Consistency
|
||||
|
||||
- **Status aggregation**: Widget-level status calculated from individual metric statuses
|
||||
- **Color mapping**: Status enum maps to existing color scheme
|
||||
- **Status indicators**: Current status display format preserved
|
||||
|
||||
## Implementation Migration Strategy
|
||||
|
||||
### Phase 1: Shared Types
|
||||
1. Create `shared/` crate with common protocol and metric types
|
||||
2. Update both agent and dashboard to use shared types
|
||||
|
||||
### Phase 2: Agent Migration
|
||||
1. Implement new agent architecture with individual metrics
|
||||
2. Maintain backward compatibility during transition
|
||||
|
||||
### Phase 3: Dashboard Migration
|
||||
1. Update dashboard to consume individual metrics
|
||||
2. Preserve all existing UI layouts and interactions
|
||||
3. Enhance widgets with new metric subscription system
|
||||
|
||||
### Phase 4: Integration Testing
|
||||
1. End-to-end testing with real multi-host scenarios
|
||||
2. Performance validation and optimization
|
||||
3. UI/UX validation to ensure no regressions
|
||||
|
||||
## Benefits of This Architecture
|
||||
|
||||
1. **Maximum Flexibility**: Dashboard can compose any widget from any metrics
|
||||
2. **Easy Extension**: Adding new metrics doesn't affect existing code
|
||||
3. **Granular Caching**: Cache individual metrics based on collection cost
|
||||
4. **Simple Testing**: Test individual metric collection in isolation
|
||||
5. **Clear Separation**: Agent collects, dashboard consumes and displays
|
||||
6. **Efficient Updates**: Only send changed metrics to dashboard
|
||||
|
||||
## Future Extensions
|
||||
|
||||
- **Metric Filtering**: Dashboard requests only needed metrics
|
||||
- **Historical Storage**: Store metric history for trending
|
||||
- **Metric Aggregation**: Calculate derived metrics from base metrics
|
||||
- **Dynamic Discovery**: Auto-discover new metric sources
|
||||
- **Metric Validation**: Validate metric values and ranges
|
||||
58
BENCHMARK.md
58
BENCHMARK.md
@@ -1,58 +0,0 @@
|
||||
# CM Dashboard Agent Performance Benchmark
|
||||
|
||||
## Test Environment
|
||||
- Host: srv01
|
||||
- Rust: release build with optimizations
|
||||
- Test date: 2025-10-16
|
||||
- Collection interval: 5 seconds (realtime for all collectors)
|
||||
|
||||
## Benchmark Methodology
|
||||
1. Set all collectors to realtime (5s interval)
|
||||
2. Test each collector individually
|
||||
3. Measure CPU usage with `ps aux` after 10 seconds
|
||||
4. Record collection time from debug logs
|
||||
|
||||
## Baseline - All Collectors Enabled
|
||||
|
||||
### Results
|
||||
- **CPU Usage**: 74.6%
|
||||
- **Total Metrics**: ~80 (5 CPU + 6 Memory + 3 Disk + ~66 Systemd)
|
||||
- **Collection Time**: ~1350ms (dominated by systemd collector)
|
||||
|
||||
## Individual Collector Tests
|
||||
|
||||
### CPU Collector Only
|
||||
- **CPU Usage**: TBD%
|
||||
- **Metrics Count**: TBD
|
||||
- **Collection Time**: TBD ms
|
||||
- **Utilities Used**: `/proc/loadavg`, `/sys/class/thermal/thermal_zone*/temp`, `/proc/cpuinfo`
|
||||
|
||||
### Memory Collector Only
|
||||
- **CPU Usage**: TBD%
|
||||
- **Metrics Count**: TBD
|
||||
- **Collection Time**: TBD ms
|
||||
- **Utilities Used**: `/proc/meminfo`
|
||||
|
||||
### Disk Collector Only
|
||||
- **CPU Usage**: TBD%
|
||||
- **Metrics Count**: TBD
|
||||
- **Collection Time**: TBD ms
|
||||
- **Utilities Used**: `du -s /tmp`
|
||||
|
||||
### Systemd Collector Only
|
||||
- **CPU Usage**: TBD%
|
||||
- **Metrics Count**: TBD
|
||||
- **Collection Time**: TBD ms
|
||||
- **Utilities Used**: `systemctl list-units`, `systemctl show <service>`, `du -s <service-dir>`
|
||||
|
||||
## Analysis
|
||||
|
||||
### Performance Bottlenecks
|
||||
- TBD
|
||||
|
||||
### Recommendations
|
||||
- TBD
|
||||
|
||||
### Optimal Cache Intervals
|
||||
Based on performance impact:
|
||||
- TBD
|
||||
@@ -1,85 +0,0 @@
|
||||
# CM Dashboard Cache Optimization Summary
|
||||
|
||||
## 🎯 Goal Achieved: CPU Usage < 1%
|
||||
|
||||
From benchmark testing, we discovered that separating collectors based on disk I/O patterns provides optimal performance.
|
||||
|
||||
## 📊 Optimized Cache Tiers (Based on Disk I/O)
|
||||
|
||||
### ⚡ **REALTIME** (5 seconds) - Memory/CPU Operations
|
||||
**No disk I/O - fastest operations**
|
||||
- `cpu_load_*` - CPU load averages (reading /proc/loadavg)
|
||||
- `cpu_temperature_*` - CPU temperature (reading /sys)
|
||||
- `cpu_frequency_*` - CPU frequency (reading /sys)
|
||||
- `memory_*` - Memory usage (reading /proc/meminfo)
|
||||
- `service_*_cpu_percent` - Service CPU usage (from systemctl show)
|
||||
- `service_*_memory_mb` - Service memory usage (from systemctl show)
|
||||
- `network_*` - Network statistics (reading /proc/net)
|
||||
|
||||
### 🔸 **DISK_LIGHT** (1 minute) - Light Disk Operations
|
||||
**Service status checks**
|
||||
- `service_*_status` - Service status (systemctl is-active)
|
||||
|
||||
### 🔹 **DISK_MEDIUM** (5 minutes) - Medium Disk Operations
|
||||
**Disk usage commands (du)**
|
||||
- `service_*_disk_gb` - Service disk usage (du commands)
|
||||
- `disk_tmp_*` - Temporary disk usage
|
||||
- `disk_*_usage_*` - General disk usage metrics
|
||||
- `disk_*_size_*` - Disk size metrics
|
||||
|
||||
### 🔶 **DISK_HEAVY** (15 minutes) - Heavy Disk Operations
|
||||
**SMART data, backup checks**
|
||||
- `disk_*_temperature` - SMART temperature data
|
||||
- `disk_*_wear_percent` - SMART wear leveling
|
||||
- `smart_*` - All SMART metrics
|
||||
- `backup_*` - Backup status checks
|
||||
|
||||
### 🔷 **STATIC** (1 hour) - Hardware Info
|
||||
**Rarely changing information**
|
||||
- Hardware specifications
|
||||
- System capabilities
|
||||
|
||||
## 🔧 Technical Implementation
|
||||
|
||||
### Pattern Matching
|
||||
```rust
|
||||
fn matches_pattern(&self, metric_name: &str, pattern: &str) -> bool {
|
||||
// Supports patterns like:
|
||||
// "cpu_*" - prefix matching
|
||||
// "*_status" - suffix matching
|
||||
// "service_*_disk_gb" - prefix + suffix matching
|
||||
}
|
||||
```
|
||||
|
||||
### Cache Assignment Logic
|
||||
```rust
|
||||
pub fn get_cache_interval(&self, metric_name: &str) -> u64 {
|
||||
self.get_tier_for_metric(metric_name)
|
||||
.map(|tier| tier.interval_seconds)
|
||||
.unwrap_or(self.default_ttl_seconds) // 30s fallback
|
||||
}
|
||||
```
|
||||
|
||||
## 📈 Performance Results
|
||||
|
||||
| Operation Type | Cache Interval | Example Metrics | Expected CPU Impact |
|
||||
|---|---|---|---|
|
||||
| Memory/CPU reads | 5s | `cpu_load_1min`, `memory_usage_percent` | Minimal |
|
||||
| Service status | 1min | `service_nginx_status` | Low |
|
||||
| Disk usage (du) | 5min | `service_nginx_disk_gb` | Medium |
|
||||
| SMART data | 15min | `disk_nvme0_temperature` | High |
|
||||
|
||||
## 🎯 Key Benefits
|
||||
|
||||
1. **CPU Efficiency**: Non-disk operations run at realtime (5s) with minimal CPU impact
|
||||
2. **Disk I/O Optimization**: Heavy disk operations cached for 5-15 minutes
|
||||
3. **Responsive Monitoring**: Critical metrics (CPU, memory) updated every 5 seconds
|
||||
4. **Intelligent Caching**: Operations cached based on their actual resource cost
|
||||
|
||||
## 🧪 Test Results
|
||||
|
||||
- **Before optimization**: 10% CPU usage (unacceptable)
|
||||
- **After optimization**: 0.3% CPU usage (97% improvement)
|
||||
- **Target achieved**: < 1% CPU usage ✅
|
||||
|
||||
This configuration provides optimal balance between responsiveness and resource efficiency.
|
||||
791
CLAUDE.md
791
CLAUDE.md
@@ -2,599 +2,382 @@
|
||||
|
||||
## Overview
|
||||
|
||||
A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. Built to replace Glance with a custom solution tailored for our specific monitoring needs and ZMQ-based metric collection.
|
||||
A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. Built with ZMQ-based metric collection and individual metrics architecture.
|
||||
|
||||
## CRITICAL: Architecture Redesign in Progress
|
||||
## Current Features
|
||||
|
||||
**LEGACY CODE DEPRECATION**: The current codebase is being completely rewritten with a new individual metrics architecture. ALL existing code will be moved to a backup folder for reference only.
|
||||
### Core Functionality
|
||||
|
||||
**NEW IMPLEMENTATION STRATEGY**:
|
||||
- **NO legacy code reuse** - Fresh implementation following ARCHITECT.md
|
||||
- **Clean slate approach** - Build entirely new codebase structure
|
||||
- **Reference-only legacy** - Current code preserved only for functionality reference
|
||||
- **Real-time Monitoring**: CPU, RAM, Storage, and Service status
|
||||
- **Service Management**: Start/stop services with user-stopped tracking
|
||||
- **Multi-host Support**: Monitor multiple servers from single dashboard
|
||||
- **NixOS Integration**: System rebuild via SSH + tmux popup
|
||||
- **Backup Monitoring**: Borgbackup status and scheduling
|
||||
|
||||
## Implementation Strategy
|
||||
### User-Stopped Service Tracking
|
||||
|
||||
### Phase 1: Legacy Code Backup (IMMEDIATE)
|
||||
- Services stopped via dashboard are marked as "user-stopped"
|
||||
- User-stopped services report Status::Ok instead of Warning
|
||||
- Prevents false alerts during intentional maintenance
|
||||
- Persistent storage survives agent restarts
|
||||
- Automatic flag clearing when services are restarted via dashboard
|
||||
|
||||
**Backup Current Implementation:**
|
||||
```bash
|
||||
# Create backup folder for reference
|
||||
mkdir -p backup/legacy-2025-10-16
|
||||
### Custom Service Logs
|
||||
|
||||
# Move all current source code to backup
|
||||
mv agent/ backup/legacy-2025-10-16/
|
||||
mv dashboard/ backup/legacy-2025-10-16/
|
||||
mv shared/ backup/legacy-2025-10-16/
|
||||
|
||||
# Preserve configuration examples
|
||||
cp -r config/ backup/legacy-2025-10-16/
|
||||
|
||||
# Keep important documentation
|
||||
cp CLAUDE.md backup/legacy-2025-10-16/CLAUDE-legacy.md
|
||||
cp README.md backup/legacy-2025-10-16/README-legacy.md
|
||||
```
|
||||
|
||||
**Reference Usage Rules:**
|
||||
- Legacy code is **REFERENCE ONLY** - never copy/paste
|
||||
- Study existing functionality and UI layout patterns
|
||||
- Understand current widget behavior and status mapping
|
||||
- Reference notification logic and email formatting
|
||||
- NO legacy code in new implementation
|
||||
|
||||
### Phase 2: Clean Slate Implementation
|
||||
|
||||
**New Codebase Structure:**
|
||||
Following ARCHITECT.md precisely with zero legacy dependencies:
|
||||
|
||||
```
|
||||
cm-dashboard/ # New clean repository root
|
||||
├── ARCHITECT.md # Architecture documentation
|
||||
├── CLAUDE.md # This file (updated)
|
||||
├── README.md # New implementation documentation
|
||||
├── Cargo.toml # Workspace configuration
|
||||
├── agent/ # New agent implementation
|
||||
│ ├── Cargo.toml
|
||||
│ └── src/ ... (per ARCHITECT.md)
|
||||
├── dashboard/ # New dashboard implementation
|
||||
│ ├── Cargo.toml
|
||||
│ └── src/ ... (per ARCHITECT.md)
|
||||
├── shared/ # New shared types
|
||||
│ ├── Cargo.toml
|
||||
│ └── src/ ... (per ARCHITECT.md)
|
||||
├── config/ # New configuration examples
|
||||
└── backup/ # Legacy code for reference
|
||||
└── legacy-2025-10-16/
|
||||
```
|
||||
|
||||
### Phase 3: Implementation Priorities
|
||||
|
||||
**Agent Implementation (Priority 1):**
|
||||
1. Individual metrics collection system
|
||||
2. ZMQ communication protocol
|
||||
3. Basic collectors (CPU, memory, disk, services)
|
||||
4. Status calculation and thresholds
|
||||
5. Email notification system
|
||||
|
||||
**Dashboard Implementation (Priority 2):**
|
||||
1. ZMQ metric consumer
|
||||
2. Metric storage and subscription system
|
||||
3. Base widget trait and framework
|
||||
4. Core widgets (CPU, memory, storage, services)
|
||||
5. Host management and navigation
|
||||
|
||||
**Testing & Integration (Priority 3):**
|
||||
1. End-to-end metric flow validation
|
||||
2. Multi-host connection testing
|
||||
3. UI layout validation against legacy appearance
|
||||
4. Performance benchmarking
|
||||
|
||||
## Project Goals (Updated)
|
||||
|
||||
### Core Objectives
|
||||
|
||||
- **Individual metric architecture** for maximum dashboard flexibility
|
||||
- **Multi-host support** for cmbox, labbox, simonbox, steambox, srv01
|
||||
- **Performance-focused** with minimal resource usage
|
||||
- **Keyboard-driven interface** preserving current UI layout
|
||||
- **ZMQ-based communication** replacing HTTP API polling
|
||||
|
||||
### Key Features
|
||||
|
||||
- **Granular metric collection** (cpu_load_1min, memory_usage_percent, etc.)
|
||||
- **Widget-based metric subscription** for flexible dashboard composition
|
||||
- **Preserved UI layout** maintaining current visual design
|
||||
- **Intelligent caching** for optimal performance
|
||||
- **Auto-discovery** of services and system components
|
||||
- **Email notifications** for status changes with rate limiting
|
||||
- **Maintenance mode** integration for planned downtime
|
||||
|
||||
## New Technical Architecture
|
||||
|
||||
### Technology Stack (Updated)
|
||||
|
||||
- **Language**: Rust 🦀
|
||||
- **Communication**: ZMQ (zeromq) for agent-dashboard messaging
|
||||
- **TUI Framework**: ratatui (modern tui-rs fork)
|
||||
- **Async Runtime**: tokio
|
||||
- **Serialization**: serde (JSON for metrics)
|
||||
- **CLI**: clap
|
||||
- **Error Handling**: thiserror + anyhow
|
||||
- **Time**: chrono
|
||||
- **Email**: lettre (SMTP notifications)
|
||||
|
||||
### New Dependencies
|
||||
- Configure service-specific log file paths per host in dashboard config
|
||||
- Press `L` on any service to view custom log files via `tail -f`
|
||||
- Configuration format in dashboard config:
|
||||
|
||||
```toml
|
||||
# Workspace Cargo.toml
|
||||
[workspace]
|
||||
members = ["agent", "dashboard", "shared"]
|
||||
|
||||
# Agent dependencies
|
||||
[dependencies.agent]
|
||||
zmq = "0.10" # ZMQ communication
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
tokio = { version = "1.0", features = ["full"] }
|
||||
clap = { version = "4.0", features = ["derive"] }
|
||||
thiserror = "1.0"
|
||||
anyhow = "1.0"
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
lettre = { version = "0.11", features = ["smtp-transport"] }
|
||||
gethostname = "0.4"
|
||||
|
||||
# Dashboard dependencies
|
||||
[dependencies.dashboard]
|
||||
ratatui = "0.24"
|
||||
crossterm = "0.27"
|
||||
zmq = "0.10"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
tokio = { version = "1.0", features = ["full"] }
|
||||
clap = { version = "4.0", features = ["derive"] }
|
||||
thiserror = "1.0"
|
||||
anyhow = "1.0"
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
|
||||
# Shared dependencies
|
||||
[dependencies.shared]
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
thiserror = "1.0"
|
||||
[service_logs]
|
||||
hostname1 = [
|
||||
{ service_name = "nginx", log_file_path = "/var/log/nginx/access.log" },
|
||||
{ service_name = "app", log_file_path = "/var/log/myapp/app.log" }
|
||||
]
|
||||
hostname2 = [
|
||||
{ service_name = "database", log_file_path = "/var/log/postgres/postgres.log" }
|
||||
]
|
||||
```
|
||||
|
||||
## New Project Structure
|
||||
### Service Management
|
||||
|
||||
**REFERENCE**: See ARCHITECT.md for complete folder structure specification.
|
||||
- **Direct Control**: Arrow keys (↑↓) or vim keys (j/k) navigate services
|
||||
- **Service Actions**:
|
||||
- `s` - Start service (sends UserStart command)
|
||||
- `S` - Stop service (sends UserStop command)
|
||||
- `J` - Show service logs (journalctl in tmux popup)
|
||||
- `L` - Show custom log files (tail -f custom paths in tmux popup)
|
||||
- `R` - Rebuild current host
|
||||
- **Visual Status**: Green ● (active), Yellow ◐ (inactive), Red ◯ (failed)
|
||||
- **Transitional Icons**: Blue arrows during operations
|
||||
|
||||
**Current Status**: Legacy code preserved in `backup/legacy-2025-10-16/` for reference only.
|
||||
### Navigation
|
||||
|
||||
**Implementation Progress**:
|
||||
- [x] Architecture documentation (ARCHITECT.md)
|
||||
- [x] Implementation strategy (CLAUDE.md updates)
|
||||
- [ ] Legacy code backup
|
||||
- [ ] New workspace setup
|
||||
- [ ] Shared types implementation
|
||||
- [ ] Agent implementation
|
||||
- [ ] Dashboard implementation
|
||||
- [ ] Integration testing
|
||||
- **Tab**: Switch between hosts
|
||||
- **↑↓ or j/k**: Select services
|
||||
- **s**: Start selected service (UserStart)
|
||||
- **S**: Stop selected service (UserStop)
|
||||
- **J**: Show service logs (journalctl)
|
||||
- **L**: Show custom log files
|
||||
- **R**: Rebuild current host
|
||||
- **B**: Run backup on current host
|
||||
- **q**: Quit dashboard
|
||||
|
||||
### New Individual Metrics Architecture
|
||||
## Core Architecture Principles
|
||||
|
||||
**REPLACED**: Legacy grouped structures (SmartMetrics, ServiceMetrics, etc.) are replaced with individual metrics.
|
||||
### Structured Data Architecture (✅ IMPLEMENTED v0.1.131)
|
||||
|
||||
**New Approach**: See ARCHITECT.md for individual metric definitions:
|
||||
Complete migration from string-based metrics to structured JSON data. Eliminates all string parsing bugs and provides type-safe data access.
|
||||
|
||||
```rust
|
||||
// Individual metrics examples:
|
||||
"cpu_load_1min" -> 2.5
|
||||
"cpu_temperature_celsius" -> 45.0
|
||||
"memory_usage_percent" -> 78.5
|
||||
"disk_nvme0_wear_percent" -> 12.3
|
||||
"service_ssh_status" -> "active"
|
||||
"backup_last_run_timestamp" -> 1697123456
|
||||
```
|
||||
**Previous (String Metrics):**
|
||||
|
||||
**Shared Types**: Located in `shared/src/metrics.rs`:
|
||||
- ❌ Agent sent individual metrics with string names like `disk_nvme0n1_temperature`
|
||||
- ❌ Dashboard parsed metric names with underscore counting and string splitting
|
||||
- ❌ Complex and error-prone metric filtering and extraction logic
|
||||
|
||||
```rust
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Metric {
|
||||
pub name: String,
|
||||
pub value: MetricValue,
|
||||
pub status: Status,
|
||||
pub timestamp: u64,
|
||||
pub description: Option<String>,
|
||||
pub unit: Option<String>,
|
||||
}
|
||||
**Current (Structured Data):**
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum MetricValue {
|
||||
Float(f32),
|
||||
Integer(i64),
|
||||
String(String),
|
||||
Boolean(bool),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum Status {
|
||||
Ok,
|
||||
Warning,
|
||||
Critical,
|
||||
Unknown,
|
||||
```json
|
||||
{
|
||||
"hostname": "cmbox",
|
||||
"agent_version": "v0.1.131",
|
||||
"timestamp": 1763926877,
|
||||
"system": {
|
||||
"cpu": {
|
||||
"load_1min": 3.5,
|
||||
"load_5min": 3.57,
|
||||
"load_15min": 3.58,
|
||||
"frequency_mhz": 1500,
|
||||
"temperature_celsius": 45.2
|
||||
},
|
||||
"memory": {
|
||||
"usage_percent": 25.0,
|
||||
"total_gb": 23.3,
|
||||
"used_gb": 5.9,
|
||||
"swap_total_gb": 10.7,
|
||||
"swap_used_gb": 0.99,
|
||||
"tmpfs": [
|
||||
{
|
||||
"mount": "/tmp",
|
||||
"usage_percent": 15.0,
|
||||
"used_gb": 0.3,
|
||||
"total_gb": 2.0
|
||||
}
|
||||
]
|
||||
},
|
||||
"storage": {
|
||||
"drives": [
|
||||
{
|
||||
"name": "nvme0n1",
|
||||
"health": "PASSED",
|
||||
"temperature_celsius": 29.0,
|
||||
"wear_percent": 1.0,
|
||||
"filesystems": [
|
||||
{
|
||||
"mount": "/",
|
||||
"usage_percent": 24.0,
|
||||
"used_gb": 224.9,
|
||||
"total_gb": 928.2
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"pools": [
|
||||
{
|
||||
"name": "srv_media",
|
||||
"mount": "/srv/media",
|
||||
"type": "mergerfs",
|
||||
"health": "healthy",
|
||||
"usage_percent": 63.0,
|
||||
"used_gb": 2355.2,
|
||||
"total_gb": 3686.4,
|
||||
"data_drives": [{ "name": "sdb", "temperature_celsius": 24.0 }],
|
||||
"parity_drives": [{ "name": "sdc", "temperature_celsius": 24.0 }]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"services": [
|
||||
{ "name": "sshd", "status": "active", "memory_mb": 4.5, "disk_gb": 0.0 }
|
||||
],
|
||||
"backup": {
|
||||
"status": "completed",
|
||||
"last_run": 1763920000,
|
||||
"next_scheduled": 1764006400,
|
||||
"total_size_gb": 150.5,
|
||||
"repository_health": "ok"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## UI Layout Preservation
|
||||
|
||||
**CRITICAL**: The exact visual layout shown above is **PRESERVED** in the new implementation.
|
||||
|
||||
**Implementation Strategy**:
|
||||
- New widgets subscribe to individual metrics but render identically
|
||||
- Same positions, colors, borders, and keyboard shortcuts
|
||||
- Enhanced with flexible metric composition under the hood
|
||||
|
||||
**Reference**: Legacy widgets in `backup/legacy-2025-10-16/dashboard/src/ui/` show exact rendering logic to replicate.
|
||||
|
||||
## Core Architecture Principles - CRITICAL
|
||||
|
||||
### Individual Metrics Philosophy
|
||||
|
||||
**NEW ARCHITECTURE**: Agent collects individual metrics, dashboard composes widgets from those metrics.
|
||||
|
||||
**Status Calculation**:
|
||||
- Agent calculates status for each individual metric
|
||||
- Agent sends individual metrics with status via ZMQ
|
||||
- Dashboard aggregates metric statuses for widget-level status
|
||||
- Dashboard NEVER calculates metric status - only displays and aggregates
|
||||
|
||||
**Data Flow Architecture:**
|
||||
```
|
||||
Agent (individual metrics + status) → ZMQ → Dashboard (subscribe + display) → Widgets (compose + render)
|
||||
```
|
||||
|
||||
### Migration from Legacy Architecture
|
||||
|
||||
**OLD (DEPRECATED)**:
|
||||
```
|
||||
Agent → ServiceMetrics{summary, services} → Dashboard → Widget
|
||||
Agent → SmartMetrics{drives, summary} → Dashboard → Widget
|
||||
```
|
||||
|
||||
**NEW (IMPLEMENTING)**:
|
||||
```
|
||||
Agent → ["cpu_load_1min", "memory_usage_percent", ...] → Dashboard → Widgets subscribe to needed metrics
|
||||
```
|
||||
|
||||
### Current Agent Thresholds (as of 2025-10-12)
|
||||
|
||||
**CPU Load (service.rs:392-400):**
|
||||
- Warning: ≥ 2.0 (testing value, was 5.0)
|
||||
- Critical: ≥ 4.0 (testing value, was 8.0)
|
||||
|
||||
**CPU Temperature (service.rs:412-420):**
|
||||
- Warning: ≥ 70.0°C
|
||||
- Critical: ≥ 80.0°C
|
||||
|
||||
**Memory Usage (service.rs:402-410):**
|
||||
- Warning: ≥ 80%
|
||||
- Critical: ≥ 95%
|
||||
|
||||
### Email Notifications
|
||||
|
||||
**System Configuration:**
|
||||
- From: `{hostname}@cmtec.se` (e.g., cmbox@cmtec.se)
|
||||
- To: `cm@cmtec.se`
|
||||
- SMTP: localhost:25 (postfix)
|
||||
- Timezone: Europe/Stockholm (not UTC)
|
||||
|
||||
**Notification Triggers:**
|
||||
- Status degradation: any → "warning" or "critical"
|
||||
- Recovery: "warning"/"critical" → "ok"
|
||||
- Rate limiting: configurable (set to 0 for testing, 30 minutes for production)
|
||||
|
||||
**Monitored Components:**
|
||||
- system.cpu (load status) - SystemCollector
|
||||
- system.memory (usage status) - SystemCollector
|
||||
- system.cpu_temp (temperature status) - SystemCollector (disabled)
|
||||
- system.services (service health status) - ServiceCollector
|
||||
- storage.smart (drive health) - SmartCollector
|
||||
- backup.overall (backup status) - BackupCollector
|
||||
|
||||
### Pure Auto-Discovery Implementation
|
||||
|
||||
**Agent Configuration:**
|
||||
- No config files required
|
||||
- Auto-detects storage devices, services, backup systems
|
||||
- Runtime discovery of system capabilities
|
||||
- CLI: `cm-dashboard-agent [-v]` (intelligent caching enabled)
|
||||
|
||||
**Service Discovery:**
|
||||
- Scans running systemd services
|
||||
- Filters by predefined interesting patterns (gitea, nginx, docker, etc.)
|
||||
- No host-specific hardcoded service lists
|
||||
|
||||
### Current Implementation Status
|
||||
|
||||
**Completed:**
|
||||
- [x] Pure auto-discovery agent (no config files)
|
||||
- [x] Agent-side status calculations with defined thresholds
|
||||
- [x] Dashboard displays agent status (no dashboard calculations)
|
||||
- [x] Email notifications with Stockholm timezone
|
||||
- [x] CPU temperature monitoring and notifications
|
||||
- [x] ZMQ message format standardization
|
||||
- [x] Removed all hardcoded dashboard thresholds
|
||||
- [x] CPU thresholds restored to production values (5.0/8.0)
|
||||
- [x] All collectors output standardized status strings (ok/warning/critical/unknown)
|
||||
- [x] Dashboard connection loss detection with 5-second keep-alive
|
||||
- [x] Removed excessive logging from agent
|
||||
- [x] Fixed all compiler warnings in both agent and dashboard
|
||||
- [x] **SystemCollector architecture refactoring completed (2025-10-12)**
|
||||
- [x] Created SystemCollector for CPU load, memory, temperature, C-states
|
||||
- [x] Moved system metrics from ServiceCollector to SystemCollector
|
||||
- [x] Updated dashboard to parse and display SystemCollector data
|
||||
- [x] Enhanced service notifications to include specific failure details
|
||||
- [x] CPU temperature thresholds set to 100°C (effectively disabled)
|
||||
- [x] **SystemCollector bug fixes completed (2025-10-12)**
|
||||
- [x] Fixed CPU load parsing for comma decimal separator locale (", " split)
|
||||
- [x] Fixed CPU temperature to prioritize x86_pkg_temp over generic thermal zones
|
||||
- [x] Fixed C-state collection to discover all available states (including C10)
|
||||
- [x] **Dashboard improvements and maintenance mode (2025-10-13)**
|
||||
- [x] Host auto-discovery with predefined CMTEC infrastructure hosts (cmbox, labbox, simonbox, steambox, srv01)
|
||||
- [x] Host navigation limited to connected hosts only (no disconnected host cycling)
|
||||
- [x] Storage widget restructured: Name/Temp/Wear/Usage columns with SMART details as descriptions
|
||||
- [x] Agent-provided descriptions for Storage widget (agent is source of truth for formatting)
|
||||
- [x] Maintenance mode implementation: /tmp/cm-maintenance file suppresses notifications
|
||||
- [x] NixOS borgbackup integration with automatic maintenance mode during backups
|
||||
- [x] System widget simplified to single row with C-states as description lines
|
||||
- [x] CPU load thresholds updated to production values (9.0/10.0)
|
||||
- [x] **Smart caching system implementation (2025-10-15)**
|
||||
- [x] Comprehensive intelligent caching with tiered collection intervals (RealTime/Fast/Medium/Slow/Static)
|
||||
- [x] Cache warming for instant dashboard startup responsiveness
|
||||
- [x] Background refresh and proactive cache invalidation strategies
|
||||
- [x] CPU usage optimization from 9.5% to <2% through smart polling reduction
|
||||
- [x] Cache key consistency fixes for proper collector data flow
|
||||
- [x] ZMQ broadcast mechanism ensuring continuous data delivery to dashboard
|
||||
- [x] Immich service quota detection fix (500GB instead of hardcoded 200GB)
|
||||
- [x] Service-to-directory mapping for accurate disk usage calculation
|
||||
- [x] **Real-time process monitoring implementation (2025-10-16)**
|
||||
- [x] Fixed hardcoded top CPU/RAM process display with real data
|
||||
- [x] Added top CPU and RAM process collection to CpuCollector
|
||||
- [x] Implemented ps-based process monitoring with accurate percentages
|
||||
- [x] Added intelligent filtering to avoid self-monitoring artifacts
|
||||
- [x] Dashboard updated to display real-time top processes instead of placeholder text
|
||||
- [x] Fixed disk metrics permission issues in systemd collector
|
||||
- [x] Enhanced error logging for service directory access problems
|
||||
- [x] Optimized service collection focusing on status, memory, and disk metrics only
|
||||
- [x] **Comprehensive backup monitoring implementation (2025-10-18)**
|
||||
- [x] Added BackupCollector for reading TOML status files with disk space metrics
|
||||
- [x] Implemented BackupWidget with disk usage display and service status details
|
||||
- [x] Fixed backup script disk space parsing by adding missing capture_output=True
|
||||
- [x] Updated backup widget to show actual disk usage instead of repository size
|
||||
- [x] Fixed timestamp parsing to use backup completion time instead of start time
|
||||
- [x] Resolved timezone issues by using UTC timestamps in backup script
|
||||
- [x] Added disk identification metrics (product name, serial number) to backup status
|
||||
- [x] Enhanced UI layout with proper backup monitoring integration
|
||||
|
||||
**Production Configuration:**
|
||||
- CPU load thresholds: Warning ≥ 9.0, Critical ≥ 10.0
|
||||
- CPU temperature thresholds: Warning ≥ 100°C, Critical ≥ 100°C (effectively disabled)
|
||||
- Memory usage thresholds: Warning ≥ 80%, Critical ≥ 95%
|
||||
- Connection timeout: 15 seconds (agents send data every 5 seconds)
|
||||
- Email rate limiting: 30 minutes (set to 0 for testing)
|
||||
- ✅ Agent sends structured JSON over ZMQ (no legacy support)
|
||||
- ✅ Type-safe data access: `data.system.storage.drives[0].temperature_celsius`
|
||||
- ✅ Complete metric coverage: CPU, memory, storage, services, backup
|
||||
- ✅ Backward compatibility via bridge conversion to existing UI widgets
|
||||
- ✅ All string parsing bugs eliminated
|
||||
|
||||
### Maintenance Mode
|
||||
|
||||
**Purpose:**
|
||||
- Suppress email notifications during planned maintenance or backups
|
||||
- Prevents false alerts when services are intentionally stopped
|
||||
|
||||
**Implementation:**
|
||||
- Agent checks for `/tmp/cm-maintenance` file before sending notifications
|
||||
- File presence suppresses all email notifications while continuing monitoring
|
||||
- Dashboard continues to show real status, only notifications are blocked
|
||||
|
||||
**Usage:**
|
||||
|
||||
```bash
|
||||
# Enable maintenance mode
|
||||
touch /tmp/cm-maintenance
|
||||
|
||||
# Run maintenance tasks (backups, service restarts, etc.)
|
||||
systemctl stop service
|
||||
# ... maintenance work ...
|
||||
systemctl start service
|
||||
|
||||
# Disable maintenance mode
|
||||
rm /tmp/cm-maintenance
|
||||
```
|
||||
|
||||
**NixOS Integration:**
|
||||
- Borgbackup script automatically creates/removes maintenance file
|
||||
- Automatic cleanup via trap ensures maintenance mode doesn't stick
|
||||
## Development and Deployment Architecture
|
||||
|
||||
### Configuration-Based Smart Caching System
|
||||
### Development Path
|
||||
|
||||
**Purpose:**
|
||||
- Reduce agent CPU usage from 10% to <1% through configuration-driven intelligent caching
|
||||
- Maintain dashboard responsiveness with configurable refresh strategies
|
||||
- Optimize for different data volatility characteristics via config files
|
||||
- **Location:** `~/projects/cm-dashboard`
|
||||
- **Purpose:** Development workflow only - for committing new code
|
||||
- **Access:** Only for developers to commit changes
|
||||
|
||||
**Configuration-Driven Architecture:**
|
||||
```toml
|
||||
# Cache tiers defined in agent.toml
|
||||
[cache.tiers.realtime]
|
||||
interval_seconds = 5
|
||||
description = "High-frequency metrics (CPU load, memory usage)"
|
||||
### Deployment Path
|
||||
|
||||
[cache.tiers.medium]
|
||||
interval_seconds = 300
|
||||
description = "Low-frequency metrics (service status, disk usage)"
|
||||
- **Location:** `/var/lib/cm-dashboard/nixos-config`
|
||||
- **Purpose:** Production deployment only - agent clones/pulls from git
|
||||
- **Workflow:** git pull → `/var/lib/cm-dashboard/nixos-config` → nixos-rebuild
|
||||
|
||||
[cache.tiers.slow]
|
||||
interval_seconds = 900
|
||||
description = "Very low-frequency metrics (SMART data, backup status)"
|
||||
### Git Flow
|
||||
|
||||
# Metric assignments via configuration
|
||||
[cache.metric_assignments]
|
||||
"cpu_load_*" = "realtime"
|
||||
"service_*_disk_gb" = "medium"
|
||||
"disk_*_temperature" = "slow"
|
||||
```
|
||||
Development: ~/projects/cm-dashboard → git commit → git push
|
||||
Deployment: git pull → /var/lib/cm-dashboard/nixos-config → rebuild
|
||||
```
|
||||
|
||||
**Implementation:**
|
||||
- **ConfigurableCache**: Central cache manager reading tier config from files
|
||||
- **MetricCacheManager**: Assigns metrics to tiers based on configuration patterns
|
||||
- **TierScheduler**: Manages configurable tier-based refresh timing
|
||||
- **Cache warming**: Parallel startup population for instant responsiveness
|
||||
- **Background refresh**: Proactive updates based on configured intervals
|
||||
## Automated Binary Release System
|
||||
|
||||
**Configuration:**
|
||||
```toml
|
||||
[cache]
|
||||
enabled = true
|
||||
default_ttl_seconds = 30
|
||||
max_entries = 10000
|
||||
warming_timeout_seconds = 3
|
||||
background_refresh_enabled = true
|
||||
cleanup_interval_seconds = 1800
|
||||
```
|
||||
CM Dashboard uses automated binary releases instead of source builds.
|
||||
|
||||
**Performance Benefits:**
|
||||
- CPU usage reduction: 10% → <1% target through configuration optimization
|
||||
- Configurable cache intervals prevent expensive operations from running too frequently
|
||||
- Disk usage detection cached at 5-minute intervals instead of every 5 seconds
|
||||
- Selective metric refresh based on configured volatility patterns
|
||||
### Creating New Releases
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
# Start agent with config-based caching
|
||||
cm-dashboard-agent --config /etc/cm-dashboard/agent.toml [-v]
|
||||
cd ~/projects/cm-dashboard
|
||||
git tag v0.1.X
|
||||
git push origin v0.1.X
|
||||
```
|
||||
|
||||
**Architecture:**
|
||||
- **Configuration-driven caching**: Tiered collection with configurable intervals
|
||||
- **Config file management**: All cache behavior defined in TOML configuration
|
||||
- **Responsive design**: Cache warming for instant dashboard startup
|
||||
This automatically:
|
||||
|
||||
### New Implementation Guidelines - CRITICAL
|
||||
- Builds static binaries with `RUSTFLAGS="-C target-feature=+crt-static"`
|
||||
- Creates GitHub-style release with tarball
|
||||
- Uploads binaries via Gitea API
|
||||
|
||||
**ARCHITECTURE ENFORCEMENT**:
|
||||
- **ZERO legacy code reuse** - Fresh implementation following ARCHITECT.md exactly
|
||||
- **Individual metrics only** - NO grouped metric structures
|
||||
- **Reference-only legacy** - Study old functionality, implement new architecture
|
||||
- **Clean slate mindset** - Build as if legacy codebase never existed
|
||||
### NixOS Configuration Updates
|
||||
|
||||
**Implementation Rules**:
|
||||
1. **Individual Metrics**: Each metric is collected, transmitted, and stored individually
|
||||
2. **Agent Status Authority**: Agent calculates status for each metric using thresholds
|
||||
3. **Dashboard Composition**: Dashboard widgets subscribe to specific metrics by name
|
||||
4. **Status Aggregation**: Dashboard aggregates individual metric statuses for widget status
|
||||
5. **ZMQ Communication**: All metrics transmitted via ZMQ, no HTTP APIs
|
||||
Edit `~/projects/nixosbox/hosts/services/cm-dashboard.nix`:
|
||||
|
||||
**When Adding New Metrics**:
|
||||
1. Define metric name in shared registry (e.g., "disk_nvme1_temperature_celsius")
|
||||
2. Implement collector that returns individual Metric struct
|
||||
3. Agent calculates status using configured thresholds
|
||||
4. Dashboard widgets subscribe to metric by name
|
||||
5. Notification system automatically detects status changes
|
||||
```nix
|
||||
version = "v0.1.X";
|
||||
src = pkgs.fetchurl {
|
||||
url = "https://gitea.cmtec.se/cm/cm-dashboard/releases/download/${version}/cm-dashboard-linux-x86_64.tar.gz";
|
||||
sha256 = "sha256-NEW_HASH_HERE";
|
||||
};
|
||||
```
|
||||
|
||||
**Testing & Building**:
|
||||
- **Workspace builds**: `cargo build --workspace` for all testing
|
||||
- **Clean compilation**: Remove `target/` between architecture changes
|
||||
- **ZMQ testing**: Test agent-dashboard communication independently
|
||||
- **Widget testing**: Verify UI layout matches legacy appearance exactly
|
||||
### Get Release Hash
|
||||
|
||||
**NEVER in New Implementation**:
|
||||
- Copy/paste ANY code from legacy backup
|
||||
- Create grouped metric structures (SystemMetrics, etc.)
|
||||
- Calculate status in dashboard widgets
|
||||
- Hardcode metric names in widgets (use const arrays)
|
||||
- Skip individual metric architecture for "simplicity"
|
||||
```bash
|
||||
cd ~/projects/nixosbox
|
||||
nix-build --no-out-link -E 'with import <nixpkgs> {}; fetchurl {
|
||||
url = "https://gitea.cmtec.se/cm/cm-dashboard/releases/download/v0.1.X/cm-dashboard-linux-x86_64.tar.gz";
|
||||
sha256 = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=";
|
||||
}' 2>&1 | grep "got:"
|
||||
```
|
||||
|
||||
**Legacy Reference Usage**:
|
||||
- Study UI layout and rendering logic only
|
||||
- Understand email notification formatting
|
||||
- Reference status color mapping
|
||||
- Learn host navigation patterns
|
||||
- NO code copying or structural influence
|
||||
### Building
|
||||
|
||||
# Important Communication Guidelines
|
||||
**Testing & Building:**
|
||||
|
||||
NEVER write that you have "successfully implemented" something or generate extensive summary text without first verifying with the user that the implementation is correct. This wastes tokens. Keep responses concise.
|
||||
- **Workspace builds**: `nix-shell -p openssl pkg-config --run "cargo build --workspace"`
|
||||
- **Clean compilation**: Remove `target/` between major changes
|
||||
|
||||
NEVER implement code without first getting explicit user agreement on the approach. Always ask for confirmation before proceeding with implementation.
|
||||
## Enhanced Storage Pool Visualization
|
||||
|
||||
### Auto-Discovery Architecture
|
||||
|
||||
The dashboard uses automatic storage discovery to eliminate manual configuration complexity while providing intelligent storage pool grouping.
|
||||
|
||||
### Discovery Process
|
||||
|
||||
**At Agent Startup:**
|
||||
|
||||
1. Parse `/proc/mounts` to identify all mounted filesystems
|
||||
2. Detect MergerFS pools by analyzing `fuse.mergerfs` mount sources
|
||||
3. Identify member disks and potential parity relationships via heuristics
|
||||
4. Store discovered storage topology for continuous monitoring
|
||||
5. Generate pool-aware metrics with hierarchical relationships
|
||||
|
||||
**Continuous Monitoring:**
|
||||
|
||||
- Use stored discovery data for efficient metric collection
|
||||
- Monitor individual drives for SMART data, temperature, wear
|
||||
- Calculate pool-level health based on member drive status
|
||||
- Generate enhanced metrics for dashboard visualization
|
||||
|
||||
### Supported Storage Types
|
||||
|
||||
**Single Disks:**
|
||||
|
||||
- ext4, xfs, btrfs mounted directly
|
||||
- Individual drive monitoring with SMART data
|
||||
- Traditional single-disk display for root, boot, etc.
|
||||
|
||||
**MergerFS Pools:**
|
||||
|
||||
- Auto-detect from `/proc/mounts` fuse.mergerfs entries
|
||||
- Parse source paths to identify member disks (e.g., "/mnt/disk1:/mnt/disk2")
|
||||
- Heuristic parity disk detection (sequential device names, "parity" in path)
|
||||
- Pool health calculation (healthy/degraded/critical)
|
||||
- Hierarchical tree display with data/parity disk grouping
|
||||
|
||||
**Future Extensions Ready:**
|
||||
|
||||
- RAID arrays via `/proc/mdstat` parsing
|
||||
- ZFS pools via `zpool status` integration
|
||||
- LVM logical volumes via `lvs` discovery
|
||||
|
||||
### Configuration
|
||||
|
||||
```toml
|
||||
[collectors.disk]
|
||||
enabled = true
|
||||
auto_discover = true # Default: true
|
||||
# Optional exclusions for special filesystems
|
||||
exclude_mount_points = ["/tmp", "/proc", "/sys", "/dev"]
|
||||
exclude_fs_types = ["tmpfs", "devtmpfs", "sysfs", "proc"]
|
||||
```
|
||||
|
||||
### Display Format
|
||||
|
||||
```
|
||||
Network:
|
||||
● eno1:
|
||||
├─ ip: 192.168.30.105
|
||||
└─ tailscale0: 100.125.108.16
|
||||
● eno2:
|
||||
└─ ip: 192.168.32.105
|
||||
CPU:
|
||||
● Load: 0.23 0.21 0.13
|
||||
└─ Freq: 1048 MHz
|
||||
RAM:
|
||||
● Usage: 25% 5.8GB/23.3GB
|
||||
├─ ● /tmp: 2% 0.5GB/2GB
|
||||
└─ ● /var/tmp: 0% 0GB/1.0GB
|
||||
Storage:
|
||||
● 844B9A25 T: 25C W: 4%
|
||||
├─ ● /: 55% 250.5GB/456.4GB
|
||||
└─ ● /boot: 26% 0.3GB/1.0GB
|
||||
● mergerfs /srv/media:
|
||||
├─ ● 63% 2355.2GB/3686.4GB
|
||||
├─ ● Data_1: WDZQ8H8D T: 28°C
|
||||
├─ ● Data_2: GGA04461 T: 28°C
|
||||
└─ ● Parity: WDZS8RY0 T: 29°C
|
||||
Backup:
|
||||
● WD-WCC7K1234567 T: 32°C W: 12%
|
||||
├─ Last: 2h ago (12.3GB)
|
||||
├─ Next: in 22h
|
||||
└─ ● Usage: 45% 678GB/1.5TB
|
||||
```
|
||||
|
||||
## Important Communication Guidelines
|
||||
|
||||
Keep responses concise and focused. Avoid extensive implementation summaries unless requested.
|
||||
|
||||
## Commit Message Guidelines
|
||||
|
||||
**NEVER mention:**
|
||||
|
||||
- Claude or any AI assistant names
|
||||
- Automation or AI-generated content
|
||||
- Any reference to automated code generation
|
||||
|
||||
**ALWAYS:**
|
||||
|
||||
- Focus purely on technical changes and their purpose
|
||||
- Use standard software development commit message format
|
||||
- Describe what was changed and why, not how it was created
|
||||
- Write from the perspective of a human developer
|
||||
|
||||
**Examples:**
|
||||
- ❌ "Generated with Claude Code"
|
||||
- ❌ "AI-assisted implementation"
|
||||
- ❌ "Automated refactoring"
|
||||
- ✅ "Implement maintenance mode for backup operations"
|
||||
- ✅ "Restructure storage widget with improved layout"
|
||||
- ✅ "Update CPU thresholds to production values"
|
||||
|
||||
## NixOS Configuration Updates
|
||||
## Implementation Rules
|
||||
|
||||
When code changes are made to cm-dashboard, the NixOS configuration at `~/nixosbox` must be updated to deploy the changes.
|
||||
1. **Agent Status Authority**: Agent calculates status for each metric using thresholds
|
||||
2. **Dashboard Composition**: Dashboard widgets subscribe to specific metrics by name
|
||||
3. **Status Aggregation**: Dashboard aggregates individual metric statuses for widget status
|
||||
|
||||
### Update Process
|
||||
**NEVER:**
|
||||
|
||||
1. **Get Latest Commit Hash**
|
||||
```bash
|
||||
git log -1 --format="%H"
|
||||
```
|
||||
- Copy/paste ANY code from legacy implementations
|
||||
- Calculate status in dashboard widgets
|
||||
- Hardcode metric names in widgets (use const arrays)
|
||||
- Create files unless absolutely necessary for achieving goals
|
||||
- Create documentation files unless explicitly requested
|
||||
|
||||
2. **Update NixOS Configuration**
|
||||
Edit `~/nixosbox/hosts/common/cm-dashboard.nix`:
|
||||
```nix
|
||||
src = pkgs.fetchgit {
|
||||
url = "https://gitea.cmtec.se/cm/cm-dashboard.git";
|
||||
rev = "NEW_COMMIT_HASH_HERE";
|
||||
sha256 = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA="; # Placeholder
|
||||
};
|
||||
```
|
||||
**ALWAYS:**
|
||||
|
||||
3. **Get Correct Source Hash**
|
||||
Build with placeholder hash to get the actual hash:
|
||||
```bash
|
||||
cd ~/nixosbox
|
||||
nix-build --no-out-link -E 'with import <nixpkgs> {}; fetchgit {
|
||||
url = "https://gitea.cmtec.se/cm/cm-dashboard.git";
|
||||
rev = "NEW_COMMIT_HASH";
|
||||
sha256 = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=";
|
||||
}' 2>&1 | grep "got:"
|
||||
```
|
||||
|
||||
Example output:
|
||||
```
|
||||
error: hash mismatch in fixed-output derivation '/nix/store/...':
|
||||
specified: sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=
|
||||
got: sha256-x8crxNusOUYRrkP9mYEOG+Ga3JCPIdJLkEAc5P1ZxdQ=
|
||||
```
|
||||
|
||||
4. **Update Configuration with Correct Hash**
|
||||
Replace the placeholder with the hash from the error message (the "got:" line).
|
||||
|
||||
5. **Commit NixOS Configuration**
|
||||
```bash
|
||||
cd ~/nixosbox
|
||||
git add hosts/common/cm-dashboard.nix
|
||||
git commit -m "Update cm-dashboard to latest version (SHORT_HASH)"
|
||||
git push
|
||||
```
|
||||
|
||||
6. **Rebuild System**
|
||||
The user handles the system rebuild step - this cannot be automated.
|
||||
- Prefer editing existing files to creating new ones
|
||||
- Follow existing code conventions and patterns
|
||||
- Use existing libraries and utilities
|
||||
- Follow security best practices
|
||||
|
||||
391
Cargo.lock
generated
391
Cargo.lock
generated
@@ -17,9 +17,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.3"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
@@ -71,22 +71,22 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-query"
|
||||
version = "1.1.4"
|
||||
version = "1.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2"
|
||||
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
|
||||
dependencies = [
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-wincon"
|
||||
version = "3.0.10"
|
||||
version = "3.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a"
|
||||
checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"once_cell_polyfill",
|
||||
"windows-sys 0.60.2",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -95,6 +95,15 @@ version = "1.0.100"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61"
|
||||
|
||||
[[package]]
|
||||
name = "ar_archive_writer"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0c269894b6fe5e9d7ada0cf69b5bf847ff35bc25fc271f08e1d080fce80339a"
|
||||
dependencies = [
|
||||
"object",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-trait"
|
||||
version = "0.1.89"
|
||||
@@ -132,9 +141,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.9.4"
|
||||
version = "2.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394"
|
||||
checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
|
||||
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
@@ -144,9 +153,9 @@ checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
version = "1.10.1"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
|
||||
checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3"
|
||||
|
||||
[[package]]
|
||||
name = "cassowary"
|
||||
@@ -156,9 +165,9 @@ checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.41"
|
||||
version = "1.2.46"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7"
|
||||
checksum = "b97463e1064cb1b1c1384ad0a0b9c8abd0988e2a91f52606c80ef14aadb63e36"
|
||||
dependencies = [
|
||||
"find-msvc-tools",
|
||||
"jobserver",
|
||||
@@ -178,9 +187,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.3"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9"
|
||||
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
@@ -196,6 +205,28 @@ dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chrono-tz"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d59ae0466b83e838b81a54256c39d5d7c20b9d7daa10510a242d9b75abd5936e"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"chrono-tz-build",
|
||||
"phf",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chrono-tz-build"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f"
|
||||
dependencies = [
|
||||
"parse-zoneinfo",
|
||||
"phf",
|
||||
"phf_codegen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chumsky"
|
||||
version = "0.9.3"
|
||||
@@ -208,9 +239,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.5.49"
|
||||
version = "4.5.52"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4512b90fa68d3a9932cea5184017c5d200f5921df706d45e853537dea51508f"
|
||||
checksum = "aa8120877db0e5c011242f96806ce3c94e0737ab8108532a76a3300a01db2ab8"
|
||||
dependencies = [
|
||||
"clap_builder",
|
||||
"clap_derive",
|
||||
@@ -218,9 +249,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "clap_builder"
|
||||
version = "4.5.49"
|
||||
version = "4.5.52"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0025e98baa12e766c67ba13ff4695a887a1eba19569aad00a472546795bd6730"
|
||||
checksum = "02576b399397b659c26064fbc92a75fede9d18ffd5f80ca1cd74ddab167016e1"
|
||||
dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
@@ -248,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.0"
|
||||
version = "0.1.183"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
@@ -264,16 +295,18 @@ dependencies = [
|
||||
"toml",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"wake-on-lan",
|
||||
"zmq",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.0"
|
||||
version = "0.1.183"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"chrono",
|
||||
"chrono-tz",
|
||||
"clap",
|
||||
"cm-dashboard-shared",
|
||||
"gethostname",
|
||||
@@ -291,7 +324,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.0"
|
||||
version = "0.1.183"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"serde",
|
||||
@@ -383,7 +416,7 @@ version = "0.27.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
|
||||
dependencies = [
|
||||
"bitflags 2.9.4",
|
||||
"bitflags 2.10.0",
|
||||
"crossterm_winapi",
|
||||
"libc",
|
||||
"mio 0.8.11",
|
||||
@@ -479,9 +512,9 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
||||
|
||||
[[package]]
|
||||
name = "find-msvc-tools"
|
||||
version = "0.1.4"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127"
|
||||
checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844"
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
@@ -579,14 +612,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.3.3"
|
||||
version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
|
||||
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"r-efi",
|
||||
"wasi 0.14.7+wasi-0.2.4",
|
||||
"wasip2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -744,9 +777,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "icu_collections"
|
||||
version = "2.0.0"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47"
|
||||
checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"potential_utf",
|
||||
@@ -757,9 +790,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "icu_locale_core"
|
||||
version = "2.0.0"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a"
|
||||
checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"litemap",
|
||||
@@ -770,11 +803,10 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "icu_normalizer"
|
||||
version = "2.0.0"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979"
|
||||
checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_collections",
|
||||
"icu_normalizer_data",
|
||||
"icu_properties",
|
||||
@@ -785,42 +817,38 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "icu_normalizer_data"
|
||||
version = "2.0.0"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3"
|
||||
checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a"
|
||||
|
||||
[[package]]
|
||||
name = "icu_properties"
|
||||
version = "2.0.1"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b"
|
||||
checksum = "e93fcd3157766c0c8da2f8cff6ce651a31f0810eaa1c51ec363ef790bbb5fb99"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_collections",
|
||||
"icu_locale_core",
|
||||
"icu_properties_data",
|
||||
"icu_provider",
|
||||
"potential_utf",
|
||||
"zerotrie",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_properties_data"
|
||||
version = "2.0.1"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632"
|
||||
checksum = "02845b3647bb045f1100ecd6480ff52f34c35f82d9880e029d329c21d1054899"
|
||||
|
||||
[[package]]
|
||||
name = "icu_provider"
|
||||
version = "2.0.0"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af"
|
||||
checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_locale_core",
|
||||
"stable_deref_trait",
|
||||
"tinystr",
|
||||
"writeable",
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
@@ -851,9 +879,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.11.4"
|
||||
version = "2.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5"
|
||||
checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown 0.16.0",
|
||||
@@ -861,9 +889,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "indoc"
|
||||
version = "2.0.6"
|
||||
version = "2.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
|
||||
checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
|
||||
dependencies = [
|
||||
"rustversion",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ipnet"
|
||||
@@ -873,9 +904,9 @@ checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
|
||||
|
||||
[[package]]
|
||||
name = "is_terminal_polyfill"
|
||||
version = "1.70.1"
|
||||
version = "1.70.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
||||
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
@@ -904,9 +935,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.81"
|
||||
version = "0.3.82"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305"
|
||||
checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"wasm-bindgen",
|
||||
@@ -964,9 +995,9 @@ checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039"
|
||||
|
||||
[[package]]
|
||||
name = "litemap"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956"
|
||||
checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
@@ -1021,19 +1052,19 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
||||
"wasi",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mio"
|
||||
version = "1.0.4"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c"
|
||||
checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
||||
"windows-sys 0.59.0",
|
||||
"wasi",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1080,6 +1111,15 @@ dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.32.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.21.3"
|
||||
@@ -1088,17 +1128,17 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell_polyfill"
|
||||
version = "1.70.1"
|
||||
version = "1.70.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
|
||||
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.74"
|
||||
version = "0.10.75"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24ad14dd45412269e1a30f52ad8f0664f0f4f4a89ee8fe28c3b3527021ebb654"
|
||||
checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328"
|
||||
dependencies = [
|
||||
"bitflags 2.9.4",
|
||||
"bitflags 2.10.0",
|
||||
"cfg-if",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
@@ -1126,9 +1166,9 @@ checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.110"
|
||||
version = "0.9.111"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0a9f0075ba3c21b09f8e8b2026584b1d18d49388648f2fbbf3c97ea8deced8e2"
|
||||
checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
@@ -1159,6 +1199,15 @@ dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parse-zoneinfo"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f2a05b18d44e2957b88f96ba460715e295bc1d7510468a2f3d3b44535d26c24"
|
||||
dependencies = [
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "paste"
|
||||
version = "1.0.15"
|
||||
@@ -1171,6 +1220,44 @@ version = "2.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
|
||||
dependencies = [
|
||||
"phf_shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
|
||||
dependencies = [
|
||||
"phf_generator",
|
||||
"phf_shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
|
||||
dependencies = [
|
||||
"phf_shared",
|
||||
"rand",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
|
||||
dependencies = [
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.2.16"
|
||||
@@ -1191,36 +1278,37 @@ checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
|
||||
|
||||
[[package]]
|
||||
name = "potential_utf"
|
||||
version = "0.1.3"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a"
|
||||
checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77"
|
||||
dependencies = [
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.101"
|
||||
version = "1.0.103"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de"
|
||||
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "psm"
|
||||
version = "0.1.27"
|
||||
version = "0.1.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e66fcd288453b748497d8fb18bccc83a16b0518e3906d4b8df0a8d42d93dbb1c"
|
||||
checksum = "d11f2fedc3b7dafdc2851bc52f277377c5473d378859be234bc7ebb593144d01"
|
||||
dependencies = [
|
||||
"ar_archive_writer",
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.41"
|
||||
version = "1.0.42"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1"
|
||||
checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
@@ -1237,13 +1325,28 @@ version = "5.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||
|
||||
[[package]]
|
||||
name = "ratatui"
|
||||
version = "0.24.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ebc917cfb527a566c37ecb94c7e3fd098353516fb4eb6bea17015ade0182425"
|
||||
dependencies = [
|
||||
"bitflags 2.9.4",
|
||||
"bitflags 2.10.0",
|
||||
"cassowary",
|
||||
"crossterm",
|
||||
"indoc",
|
||||
@@ -1281,7 +1384,19 @@ version = "0.5.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
|
||||
dependencies = [
|
||||
"bitflags 2.9.4",
|
||||
"bitflags 2.10.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.12.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1347,7 +1462,7 @@ version = "1.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e"
|
||||
dependencies = [
|
||||
"bitflags 2.9.4",
|
||||
"bitflags 2.10.0",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
@@ -1405,7 +1520,7 @@ version = "2.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
|
||||
dependencies = [
|
||||
"bitflags 2.9.4",
|
||||
"bitflags 2.10.0",
|
||||
"core-foundation",
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
@@ -1513,9 +1628,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook-mio"
|
||||
version = "0.2.4"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34db1a06d485c9142248b7a054f034b349b212551f3dfd19c94d45a754a217cd"
|
||||
checksum = "b75a19a7a740b25bc7944bdee6172368f988763b744e3d4dfe753f6b4ece40cc"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"mio 0.8.11",
|
||||
@@ -1531,6 +1646,12 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "siphasher"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.11"
|
||||
@@ -1612,9 +1733,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.106"
|
||||
version = "2.0.110"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6"
|
||||
checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -1722,9 +1843,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tinystr"
|
||||
version = "0.8.1"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b"
|
||||
checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"zerovec",
|
||||
@@ -1738,7 +1859,7 @@ checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"libc",
|
||||
"mio 1.0.4",
|
||||
"mio 1.1.0",
|
||||
"parking_lot",
|
||||
"pin-project-lite",
|
||||
"signal-hook-registry",
|
||||
@@ -1770,9 +1891,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tokio-util"
|
||||
version = "0.7.16"
|
||||
version = "0.7.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5"
|
||||
checksum = "2efa149fe76073d6e8fd97ef4f4eca7b67f599660115591483572e406e165594"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-core",
|
||||
@@ -1897,9 +2018,9 @@ checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.19"
|
||||
version = "1.0.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d"
|
||||
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-segmentation"
|
||||
@@ -1951,9 +2072,9 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||
|
||||
[[package]]
|
||||
name = "version-compare"
|
||||
version = "0.2.0"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "852e951cb7832cb45cb1169900d19760cfa39b82bc0ea9c0e5a14ae88411c98b"
|
||||
checksum = "03c2856837ef78f57382f06b2b8563a2f512f7185d732608fd9176cb3b8edf0e"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
@@ -1961,6 +2082,12 @@ version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||
|
||||
[[package]]
|
||||
name = "wake-on-lan"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ccf60b60ad7e5b1b37372c5134cbcab4db0706c231d212e0c643a077462bc8f"
|
||||
|
||||
[[package]]
|
||||
name = "walkdir"
|
||||
version = "2.5.0"
|
||||
@@ -1986,15 +2113,6 @@ version = "0.11.1+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.14.7+wasi-0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c"
|
||||
dependencies = [
|
||||
"wasip2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasip2"
|
||||
version = "1.0.1+wasi-0.2.4"
|
||||
@@ -2006,9 +2124,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.104"
|
||||
version = "0.2.105"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d"
|
||||
checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
@@ -2017,25 +2135,11 @@ dependencies = [
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-backend"
|
||||
version = "0.2.104"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"log",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-futures"
|
||||
version = "0.4.54"
|
||||
version = "0.4.55"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c"
|
||||
checksum = "551f88106c6d5e7ccc7cd9a16f312dd3b5d36ea8b4954304657d5dfba115d4a0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
@@ -2046,9 +2150,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.104"
|
||||
version = "0.2.105"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119"
|
||||
checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"wasm-bindgen-macro-support",
|
||||
@@ -2056,31 +2160,31 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro-support"
|
||||
version = "0.2.104"
|
||||
version = "0.2.105"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7"
|
||||
checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wasm-bindgen-backend",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-shared"
|
||||
version = "0.2.104"
|
||||
version = "0.2.105"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1"
|
||||
checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "web-sys"
|
||||
version = "0.3.81"
|
||||
version = "0.3.82"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120"
|
||||
checksum = "3a1f95c0d03a47f4ae1f7a64643a6bb97465d9b740f0fa8f90ea33915c99a9a1"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
@@ -2434,17 +2538,16 @@ checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
|
||||
|
||||
[[package]]
|
||||
name = "writeable"
|
||||
version = "0.6.1"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb"
|
||||
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
|
||||
|
||||
[[package]]
|
||||
name = "yoke"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc"
|
||||
checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"stable_deref_trait",
|
||||
"yoke-derive",
|
||||
"zerofrom",
|
||||
@@ -2452,9 +2555,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "yoke-derive"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6"
|
||||
checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -2515,9 +2618,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "zerotrie"
|
||||
version = "0.2.2"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595"
|
||||
checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"yoke",
|
||||
@@ -2526,9 +2629,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "zerovec"
|
||||
version = "0.11.4"
|
||||
version = "0.11.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b"
|
||||
checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
|
||||
dependencies = [
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
@@ -2537,9 +2640,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "zerovec-derive"
|
||||
version = "0.11.1"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f"
|
||||
checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
|
||||
745
README.md
745
README.md
@@ -1,544 +1,365 @@
|
||||
# CM Dashboard - Infrastructure Monitoring TUI
|
||||
# CM Dashboard
|
||||
|
||||
A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. Built to replace Glance with a custom solution tailored for specific monitoring needs and API integrations. Features real-time monitoring of all infrastructure components with intelligent email notifications and automatic status calculation.
|
||||
A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. Built with ZMQ-based metric collection and individual metrics architecture.
|
||||
|
||||
### System Widget
|
||||
```
|
||||
┌System───────────────────────────────────────────────────────┐
|
||||
│ Memory usage │
|
||||
│✔ 3.0 / 7.8 GB │
|
||||
│ CPU load CPU temp │
|
||||
│✔ 1.05 • 0.96 • 0.58 64.0°C │
|
||||
│ C1E C3 C6 C8 C9 C10 │
|
||||
│✔ 0.5% 0.5% 10.4% 10.2% 0.4% 77.9% │
|
||||
│ GPU load GPU temp │
|
||||
│✔ — — │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
## Features
|
||||
|
||||
### Services Widget (Enhanced)
|
||||
```
|
||||
┌Services────────────────────────────────────────────────────┐
|
||||
│ Service Memory (GB) CPU Disk │
|
||||
│✔ Service Memory 7.1/23899.7 MiB — │
|
||||
│✔ Disk Usage — — 45/100 GB │
|
||||
│⚠ CPU Load — 2.18 — │
|
||||
│✔ CPU Temperature — 47.0°C — │
|
||||
│✔ docker-registry 0.0 GB 0.0% <1 MB │
|
||||
│✔ gitea 0.4/4.1 GB 0.2% 970 MB │
|
||||
│ 1 active connections │
|
||||
│✔ nginx 0.0/1.0 GB 0.0% <1 MB │
|
||||
│✔ ├─ docker.cmtec.se │
|
||||
│✔ ├─ git.cmtec.se │
|
||||
│✔ ├─ gitea.cmtec.se │
|
||||
│✔ ├─ haasp.cmtec.se │
|
||||
│✔ ├─ pages.cmtec.se │
|
||||
│✔ └─ www.kryddorten.se │
|
||||
│✔ postgresql 0.1 GB 0.0% 378 MB │
|
||||
│ 1 active connections │
|
||||
│✔ redis-immich 0.0 GB 0.4% <1 MB │
|
||||
│✔ sshd 0.0 GB 0.0% <1 MB │
|
||||
│ 1 SSH connection │
|
||||
│✔ unifi 0.9/2.0 GB 0.4% 391 MB │
|
||||
└────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
### Core Monitoring
|
||||
- **Real-time metrics**: CPU, RAM, Storage, and Service status
|
||||
- **Multi-host support**: Monitor multiple servers from single dashboard
|
||||
- **Service management**: Start/stop services with intelligent status tracking
|
||||
- **NixOS integration**: System rebuild via SSH + tmux popup
|
||||
- **Backup monitoring**: Borgbackup status and scheduling
|
||||
- **Email notifications**: Intelligent batching prevents spam
|
||||
|
||||
### Storage Widget
|
||||
```
|
||||
┌Storage──────────────────────────────────────────────────────┐
|
||||
│ Drive Temp Wear Spare Hours Capacity Usage │
|
||||
│✔ nvme0n1 57°C 4% 100% 11463 932G 23G (2%) │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
### User-Stopped Service Tracking
|
||||
Services stopped via the dashboard are intelligently tracked to prevent false alerts:
|
||||
|
||||
### Backups Widget
|
||||
```
|
||||
┌Backups──────────────────────────────────────────────────────┐
|
||||
│ Backup Status Details │
|
||||
│✔ Latest 3h ago 1.4 GiB │
|
||||
│ 8 archives, 2.4 GiB total │
|
||||
│✔ Disk ok 2.4/468 GB (1%) │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Hosts Widget
|
||||
```
|
||||
┌Hosts────────────────────────────────────────────────────────┐
|
||||
│ Host Status Timestamp │
|
||||
│✔ cmbox ok 2025-10-13 05:45:28 │
|
||||
│✔ srv01 ok 2025-10-13 05:45:28 │
|
||||
│? labbox No data received — │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**Navigation**: `←→` hosts, `r` refresh, `q` quit
|
||||
|
||||
## Key Features
|
||||
|
||||
### Real-time Monitoring
|
||||
- **Multi-host support** for cmbox, labbox, simonbox, steambox, srv01
|
||||
- **Performance-focused** with minimal resource usage
|
||||
- **Keyboard-driven interface** for power users
|
||||
- **ZMQ gossip network** for efficient data distribution
|
||||
|
||||
### Infrastructure Monitoring
|
||||
- **NVMe health monitoring** with wear prediction and temperature tracking
|
||||
- **CPU/Memory/GPU telemetry** with automatic thresholding
|
||||
- **Service resource monitoring** with per-service CPU and RAM usage
|
||||
- **Disk usage overview** for root filesystems
|
||||
- **Backup status** with detailed metrics and history
|
||||
- **C-state monitoring** for CPU power management analysis
|
||||
|
||||
### Intelligent Alerting
|
||||
- **Agent-calculated status** with predefined thresholds
|
||||
- **Email notifications** via SMTP with rate limiting
|
||||
- **Recovery notifications** with context about original issues
|
||||
- **Stockholm timezone** support for email timestamps
|
||||
- **Unified alert pipeline** summarizing host health
|
||||
- **Smart status reporting**: User-stopped services show as Status::OK instead of Warning
|
||||
- **Persistent storage**: Tracking survives agent restarts via JSON storage
|
||||
- **Automatic management**: Flags cleared when services restarted via dashboard
|
||||
- **Maintenance friendly**: No false alerts during intentional service operations
|
||||
|
||||
## Architecture
|
||||
|
||||
### Agent-Dashboard Separation
|
||||
The system follows a strict separation of concerns:
|
||||
### Individual Metrics Philosophy
|
||||
- **Agent**: Collects individual metrics, calculates status using thresholds
|
||||
- **Dashboard**: Subscribes to specific metrics, composes widgets from individual data
|
||||
- **ZMQ Communication**: Efficient real-time metric transmission
|
||||
- **Status Aggregation**: Host-level status calculated from all service metrics
|
||||
|
||||
- **Agent**: Single source of truth for all status calculations using defined thresholds
|
||||
- **Dashboard**: Display-only interface that shows agent-provided status
|
||||
- **Data Flow**: Agent (calculations) → Status → Dashboard (display) → Colors
|
||||
### Components
|
||||
|
||||
### Agent Thresholds (Production)
|
||||
- **CPU Load**: Warning ≥ 5.0, Critical ≥ 8.0
|
||||
- **Memory Usage**: Warning ≥ 80%, Critical ≥ 95%
|
||||
- **CPU Temperature**: Warning ≥ 100°C, Critical ≥ 100°C (effectively disabled)
|
||||
|
||||
### Email Notification System
|
||||
- **From**: `{hostname}@cmtec.se` (e.g., cmbox@cmtec.se)
|
||||
- **To**: `cm@cmtec.se`
|
||||
- **SMTP**: localhost:25 (postfix)
|
||||
- **Rate Limiting**: 30 minutes (configurable)
|
||||
- **Triggers**: Status degradation and recovery with detailed context
|
||||
|
||||
## Installation
|
||||
|
||||
### Requirements
|
||||
- Rust toolchain 1.75+ (install via [`rustup`](https://rustup.rs))
|
||||
- Root privileges for agent (hardware monitoring access)
|
||||
- Network access for ZMQ communication (default port 6130)
|
||||
- SMTP server for notifications (postfix recommended)
|
||||
|
||||
### Build from Source
|
||||
```bash
|
||||
git clone https://github.com/cmtec/cm-dashboard.git
|
||||
cd cm-dashboard
|
||||
cargo build --release
|
||||
```
|
||||
┌─────────────────┐ ZMQ ┌─────────────────┐
|
||||
│ │◄──────────►│ │
|
||||
│ Agent │ Metrics │ Dashboard │
|
||||
│ - Collectors │ │ - TUI │
|
||||
│ - Status │ │ - Widgets │
|
||||
│ - Tracking │ │ - Commands │
|
||||
│ │ │ │
|
||||
└─────────────────┘ └─────────────────┘
|
||||
│ │
|
||||
▼ ▼
|
||||
┌─────────────────┐ ┌─────────────────┐
|
||||
│ JSON Storage │ │ SSH + tmux │
|
||||
│ - User-stopped │ │ - Remote rebuild│
|
||||
│ - Cache │ │ - Process │
|
||||
│ - State │ │ isolation │
|
||||
└─────────────────┘ └─────────────────┘
|
||||
```
|
||||
|
||||
Optimized binaries available at:
|
||||
- Dashboard: `target/release/cm-dashboard`
|
||||
- Agent: `target/release/cm-dashboard-agent`
|
||||
### Service Control Flow
|
||||
|
||||
### Installation
|
||||
```bash
|
||||
# Install dashboard
|
||||
cargo install --path dashboard
|
||||
1. **User Action**: Dashboard sends `UserStart`/`UserStop` commands
|
||||
2. **Agent Processing**:
|
||||
- Marks service as user-stopped (if stopping)
|
||||
- Executes `systemctl start/stop service`
|
||||
- Syncs state to global tracker
|
||||
3. **Status Calculation**:
|
||||
- Systemd collector checks user-stopped flag
|
||||
- Reports Status::OK for user-stopped inactive services
|
||||
- Normal Warning status for system failures
|
||||
|
||||
## Interface
|
||||
|
||||
# Install agent (requires root for hardware access)
|
||||
sudo cargo install --path agent
|
||||
```
|
||||
cm-dashboard • ● cmbox ● srv01 ● srv02 ● steambox
|
||||
┌system──────────────────────────────┐┌services─────────────────────────────────────────┐
|
||||
│NixOS: ││Service: Status: RAM: Disk: │
|
||||
│Build: 25.05.20251004.3bcc93c ││● docker active 27M 496MB │
|
||||
│Agent: v0.1.43 ││● gitea active 579M 2.6GB │
|
||||
│Active users: cm, simon ││● nginx active 28M 24MB │
|
||||
│CPU: ││ ├─ ● gitea.cmtec.se 51ms │
|
||||
│● Load: 0.10 0.52 0.88 • 3000MHz ││ ├─ ● photos.cmtec.se 41ms │
|
||||
│RAM: ││● postgresql active 112M 357MB │
|
||||
│● Usage: 33% 2.6GB/7.6GB ││● redis-immich user-stopped │
|
||||
│● /tmp: 0% 0B/2.0GB ││● sshd active 2M 0 │
|
||||
│Storage: ││● unifi active 594M 495MB │
|
||||
│● root (Single): ││ │
|
||||
│ ├─ ● nvme0n1 W: 1% ││ │
|
||||
│ └─ ● 18% 167.4GB/928.2GB ││ │
|
||||
└────────────────────────────────────┘└─────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Navigation
|
||||
- **Tab**: Switch between hosts
|
||||
- **↑↓ or j/k**: Navigate services
|
||||
- **s**: Start selected service (UserStart)
|
||||
- **S**: Stop selected service (UserStop)
|
||||
- **J**: Show service logs (journalctl in tmux popup)
|
||||
- **L**: Show custom log files (tail -f custom paths in tmux popup)
|
||||
- **R**: Rebuild current host
|
||||
- **B**: Run backup on current host
|
||||
- **q**: Quit
|
||||
|
||||
### Status Indicators
|
||||
- **Green ●**: Active service
|
||||
- **Yellow ◐**: Inactive service (system issue)
|
||||
- **Red ◯**: Failed service
|
||||
- **Blue arrows**: Service transitioning (↑ starting, ↓ stopping, ↻ restarting)
|
||||
- **"user-stopped"**: Service stopped via dashboard (Status::OK)
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Dashboard
|
||||
```bash
|
||||
# Run with default configuration
|
||||
cm-dashboard
|
||||
|
||||
# Specify host to monitor
|
||||
cm-dashboard --host cmbox
|
||||
|
||||
# Override ZMQ endpoints
|
||||
cm-dashboard --zmq-endpoint tcp://srv01:6130,tcp://labbox:6130
|
||||
|
||||
# Increase logging verbosity
|
||||
cm-dashboard -v
|
||||
```
|
||||
|
||||
### Agent (Pure Auto-Discovery)
|
||||
The agent requires **no configuration files** and auto-discovers all system components:
|
||||
### Building
|
||||
|
||||
```bash
|
||||
# Basic agent startup (auto-detects everything)
|
||||
sudo cm-dashboard-agent
|
||||
# With Nix (recommended)
|
||||
nix-shell -p openssl pkg-config --run "cargo build --workspace"
|
||||
|
||||
# With verbose logging for troubleshooting
|
||||
sudo cm-dashboard-agent -v
|
||||
# Or with system dependencies
|
||||
sudo apt install libssl-dev pkg-config # Ubuntu/Debian
|
||||
cargo build --workspace
|
||||
```
|
||||
|
||||
The agent automatically:
|
||||
- **Discovers storage devices** for SMART monitoring
|
||||
- **Detects running systemd services** for resource tracking
|
||||
- **Configures collection intervals** based on system capabilities
|
||||
- **Sets up email notifications** using hostname@cmtec.se
|
||||
### Running
|
||||
|
||||
```bash
|
||||
# Start agent (requires configuration)
|
||||
./target/debug/cm-dashboard-agent --config /etc/cm-dashboard/agent.toml
|
||||
|
||||
# Start dashboard (inside tmux session)
|
||||
tmux
|
||||
./target/debug/cm-dashboard --config /etc/cm-dashboard/dashboard.toml
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Dashboard Configuration
|
||||
The dashboard creates `config/dashboard.toml` on first run:
|
||||
### Agent Configuration
|
||||
|
||||
```toml
|
||||
[hosts]
|
||||
default_host = "srv01"
|
||||
collection_interval_seconds = 2
|
||||
|
||||
[[hosts.hosts]]
|
||||
name = "srv01"
|
||||
[zmq]
|
||||
publisher_port = 6130
|
||||
command_port = 6131
|
||||
bind_address = "0.0.0.0"
|
||||
transmission_interval_seconds = 2
|
||||
|
||||
[collectors.cpu]
|
||||
enabled = true
|
||||
interval_seconds = 2
|
||||
load_warning_threshold = 5.0
|
||||
load_critical_threshold = 10.0
|
||||
|
||||
[[hosts.hosts]]
|
||||
name = "cmbox"
|
||||
[collectors.memory]
|
||||
enabled = true
|
||||
interval_seconds = 2
|
||||
usage_warning_percent = 80.0
|
||||
usage_critical_percent = 90.0
|
||||
|
||||
[dashboard]
|
||||
tick_rate_ms = 250
|
||||
history_duration_minutes = 60
|
||||
[collectors.systemd]
|
||||
enabled = true
|
||||
interval_seconds = 10
|
||||
service_name_filters = ["nginx*", "postgresql*", "docker*", "sshd*"]
|
||||
excluded_services = ["nginx-config-reload", "systemd-", "getty@"]
|
||||
nginx_latency_critical_ms = 1000.0
|
||||
http_timeout_seconds = 10
|
||||
|
||||
[data_source]
|
||||
kind = "zmq"
|
||||
|
||||
[data_source.zmq]
|
||||
endpoints = ["tcp://127.0.0.1:6130"]
|
||||
[notifications]
|
||||
enabled = true
|
||||
smtp_host = "localhost"
|
||||
smtp_port = 25
|
||||
from_email = "{hostname}@example.com"
|
||||
to_email = "admin@example.com"
|
||||
aggregation_interval_seconds = 30
|
||||
```
|
||||
|
||||
### Agent Configuration (Optional)
|
||||
The agent works without configuration but supports optional settings:
|
||||
### Dashboard Configuration
|
||||
|
||||
```toml
|
||||
[zmq]
|
||||
subscriber_ports = [6130]
|
||||
|
||||
[hosts]
|
||||
predefined_hosts = ["cmbox", "srv01", "srv02"]
|
||||
|
||||
[ssh]
|
||||
rebuild_user = "cm"
|
||||
rebuild_alias = "nixos-rebuild-cmtec"
|
||||
backup_alias = "cm-backup-run"
|
||||
```
|
||||
|
||||
## Technical Implementation
|
||||
|
||||
### Collectors
|
||||
|
||||
#### Systemd Collector
|
||||
- **Service Discovery**: Uses `systemctl list-unit-files` + `list-units --all`
|
||||
- **Status Calculation**: Checks user-stopped flag before assigning Warning status
|
||||
- **Memory Tracking**: Per-service memory usage via `systemctl show`
|
||||
- **Sub-services**: Nginx site latency, Docker containers
|
||||
- **User-stopped Integration**: `UserStoppedServiceTracker::is_service_user_stopped()`
|
||||
|
||||
#### User-Stopped Service Tracker
|
||||
- **Storage**: `/var/lib/cm-dashboard/user-stopped-services.json`
|
||||
- **Thread Safety**: Global singleton with `Arc<Mutex<>>`
|
||||
- **Persistence**: Automatic save on state changes
|
||||
- **Global Access**: Static methods for collector integration
|
||||
|
||||
#### Other Collectors
|
||||
- **CPU**: Load average, temperature, frequency monitoring
|
||||
- **Memory**: RAM/swap usage, tmpfs monitoring
|
||||
- **Disk**: Filesystem usage, SMART health data
|
||||
- **NixOS**: Build version, active users, agent version
|
||||
- **Backup**: Borgbackup repository status and metrics
|
||||
|
||||
### ZMQ Protocol
|
||||
|
||||
```rust
|
||||
// Metric Message
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct MetricMessage {
|
||||
pub hostname: String,
|
||||
pub timestamp: u64,
|
||||
pub metrics: Vec<Metric>,
|
||||
}
|
||||
|
||||
// Service Commands
|
||||
pub enum AgentCommand {
|
||||
ServiceControl {
|
||||
service_name: String,
|
||||
action: ServiceAction,
|
||||
},
|
||||
SystemRebuild { /* SSH config */ },
|
||||
CollectNow,
|
||||
}
|
||||
|
||||
pub enum ServiceAction {
|
||||
Start, // System-initiated
|
||||
Stop, // System-initiated
|
||||
UserStart, // User via dashboard (clears user-stopped)
|
||||
UserStop, // User via dashboard (marks user-stopped)
|
||||
Status,
|
||||
}
|
||||
```
|
||||
|
||||
### Maintenance Mode
|
||||
|
||||
Suppress notifications during planned maintenance:
|
||||
|
||||
```bash
|
||||
# Generate example configuration
|
||||
cm-dashboard-agent --help
|
||||
# Enable maintenance mode
|
||||
touch /tmp/cm-maintenance
|
||||
|
||||
# Override specific settings
|
||||
sudo cm-dashboard-agent \
|
||||
--hostname cmbox \
|
||||
--bind tcp://*:6130 \
|
||||
--interval 5000
|
||||
# Perform maintenance
|
||||
systemctl stop service
|
||||
# ... work ...
|
||||
systemctl start service
|
||||
|
||||
# Disable maintenance mode
|
||||
rm /tmp/cm-maintenance
|
||||
```
|
||||
|
||||
## Widget Layout
|
||||
|
||||
### Services Widget Structure
|
||||
The Services widget now displays both system metrics and services in a unified table:
|
||||
|
||||
```
|
||||
┌Services────────────────────────────────────────────────────┐
|
||||
│ Service Memory (GB) CPU Disk │
|
||||
│✔ Service Memory 7.1/23899.7 MiB — │ ← System metric as service row
|
||||
│✔ Disk Usage — — 45/100 GB │ ← System metric as service row
|
||||
│⚠ CPU Load — 2.18 — │ ← System metric as service row
|
||||
│✔ CPU Temperature — 47.0°C — │ ← System metric as service row
|
||||
│✔ docker-registry 0.0 GB 0.0% <1 MB │ ← Regular service
|
||||
│✔ nginx 0.0/1.0 GB 0.0% <1 MB │ ← Regular service
|
||||
│✔ ├─ docker.cmtec.se │ ← Nginx site (sub-service)
|
||||
│✔ ├─ git.cmtec.se │ ← Nginx site (sub-service)
|
||||
│✔ └─ gitea.cmtec.se │ ← Nginx site (sub-service)
|
||||
│✔ sshd 0.0 GB 0.0% <1 MB │ ← Regular service
|
||||
│ 1 SSH connection │ ← Service description
|
||||
└────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**Row Types:**
|
||||
- **System Metrics**: CPU Load, Service Memory, Disk Usage, CPU Temperature with status indicators
|
||||
- **Regular Services**: Full resource data (memory, CPU, disk) with optional description lines
|
||||
- **Sub-services**: Nginx sites with tree structure, status indicators only (no resource columns)
|
||||
- **Description Lines**: Connection counts and service-specific info without status indicators
|
||||
|
||||
### Hosts Widget (formerly Alerts)
|
||||
The Hosts widget provides a summary view of all monitored hosts:
|
||||
|
||||
```
|
||||
┌Hosts────────────────────────────────────────────────────────┐
|
||||
│ Host Status Timestamp │
|
||||
│✔ cmbox ok 2025-10-13 05:45:28 │
|
||||
│✔ srv01 ok 2025-10-13 05:45:28 │
|
||||
│? labbox No data received — │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Monitoring Components
|
||||
|
||||
### System Collector
|
||||
- **CPU Load**: 1/5/15 minute averages with warning/critical thresholds
|
||||
- **Memory Usage**: Used/total with percentage calculation
|
||||
- **CPU Temperature**: x86_pkg_temp prioritized for accuracy
|
||||
- **C-States**: Power management state distribution (C0-C10)
|
||||
|
||||
### Service Collector
|
||||
- **System Metrics as Services**: CPU Load, Service Memory, Disk Usage, CPU Temperature displayed as individual service rows
|
||||
- **Systemd Services**: Auto-discovery of interesting services with resource monitoring
|
||||
- **Nginx Site Monitoring**: Individual rows for each nginx virtual host with tree structure (`├─` and `└─`)
|
||||
- **Resource Usage**: Per-service memory, CPU, and disk consumption
|
||||
- **Service Health**: Running/stopped/degraded status with detailed failure info
|
||||
- **Connection Tracking**: SSH connections, database connections as description lines
|
||||
|
||||
### SMART Collector
|
||||
- **NVMe Health**: Temperature, wear leveling, spare blocks
|
||||
- **Drive Capacity**: Total/used space with percentage
|
||||
- **SMART Attributes**: Critical health indicators
|
||||
|
||||
### Backup Collector
|
||||
- **Restic Integration**: Backup status and history
|
||||
- **Health Monitoring**: Success/failure tracking
|
||||
- **Storage Metrics**: Backup size and retention
|
||||
|
||||
## Keyboard Controls
|
||||
|
||||
| Key | Action |
|
||||
|-----|--------|
|
||||
| `←` / `h` | Previous host |
|
||||
| `→` / `l` / `Tab` | Next host |
|
||||
| `?` | Toggle help overlay |
|
||||
| `r` | Force refresh |
|
||||
| `q` / `Esc` | Quit |
|
||||
|
||||
## Email Notifications
|
||||
|
||||
### Notification Triggers
|
||||
- **Status Degradation**: Any status change to warning/critical
|
||||
- **Recovery**: Warning/critical status returning to ok
|
||||
- **Service Failures**: Individual service stop/start events
|
||||
### Intelligent Batching
|
||||
- **Real-time dashboard**: Immediate status updates
|
||||
- **Batched emails**: Aggregated every 30 seconds
|
||||
- **Smart grouping**: Services organized by severity
|
||||
- **Recovery suppression**: Reduces notification spam
|
||||
|
||||
### Example Recovery Email
|
||||
### Example Alert
|
||||
```
|
||||
✅ RESOLVED: system cpu on cmbox
|
||||
Subject: Status Alert: 1 critical, 2 warnings, 0 recoveries
|
||||
|
||||
Status Change Alert
|
||||
Status Summary (30s duration)
|
||||
Host Status: Ok → Warning
|
||||
|
||||
Host: cmbox
|
||||
Component: system
|
||||
Metric: cpu
|
||||
Status Change: warning → ok
|
||||
Time: 2025-10-12 22:15:30 CET
|
||||
🔴 CRITICAL ISSUES (1):
|
||||
postgresql: Ok → Critical (memory usage 95%)
|
||||
|
||||
Details:
|
||||
Recovered from: CPU load (1/5/15min): 6.20 / 5.80 / 4.50
|
||||
Current status: CPU load (1/5/15min): 3.30 / 3.17 / 2.84
|
||||
🟡 WARNINGS (2):
|
||||
nginx: Ok → Warning (high load 8.5)
|
||||
redis: user-stopped → Warning (restarted by system)
|
||||
|
||||
✅ RECOVERIES (0):
|
||||
|
||||
--
|
||||
CM Dashboard Agent
|
||||
Generated at 2025-10-12 22:15:30 CET
|
||||
CM Dashboard Agent v0.1.43
|
||||
```
|
||||
|
||||
### Rate Limiting
|
||||
- **Default**: 30 minutes between notifications per component
|
||||
- **Testing**: Set to 0 for immediate notifications
|
||||
- **Configurable**: Adjustable per deployment needs
|
||||
|
||||
## Development
|
||||
|
||||
### Project Structure
|
||||
```
|
||||
cm-dashboard/
|
||||
├── agent/ # Monitoring agent
|
||||
├── agent/ # Metrics collection agent
|
||||
│ ├── src/
|
||||
│ │ ├── collectors/ # Data collection modules
|
||||
│ │ ├── notifications.rs # Email notification system
|
||||
│ │ └── simple_agent.rs # Main agent logic
|
||||
├── dashboard/ # TUI dashboard
|
||||
│ │ ├── collectors/ # CPU, memory, disk, systemd, backup, nixos
|
||||
│ │ ├── service_tracker.rs # User-stopped service tracking
|
||||
│ │ ├── status/ # Status aggregation and notifications
|
||||
│ │ ├── config/ # TOML configuration loading
|
||||
│ │ └── communication/ # ZMQ message handling
|
||||
├── dashboard/ # TUI dashboard application
|
||||
│ ├── src/
|
||||
│ │ ├── ui/ # Widget implementations
|
||||
│ │ ├── data/ # Data structures
|
||||
│ │ └── app.rs # Application state
|
||||
├── shared/ # Common data structures
|
||||
└── config/ # Configuration files
|
||||
│ │ ├── ui/widgets/ # CPU, memory, services, backup, system
|
||||
│ │ ├── communication/ # ZMQ consumption and commands
|
||||
│ │ └── app.rs # Main application loop
|
||||
├── shared/ # Shared types and utilities
|
||||
│ └── src/
|
||||
│ ├── metrics.rs # Metric, Status, StatusTracker types
|
||||
│ ├── protocol.rs # ZMQ message format
|
||||
│ └── cache.rs # Cache configuration
|
||||
└── CLAUDE.md # Development guidelines and rules
|
||||
```
|
||||
|
||||
### Development Commands
|
||||
### Testing
|
||||
```bash
|
||||
# Format code
|
||||
cargo fmt
|
||||
# Build and test
|
||||
nix-shell -p openssl pkg-config --run "cargo build --workspace"
|
||||
nix-shell -p openssl pkg-config --run "cargo test --workspace"
|
||||
|
||||
# Check all packages
|
||||
cargo check
|
||||
|
||||
# Run tests
|
||||
cargo test
|
||||
|
||||
# Build release
|
||||
cargo build --release
|
||||
|
||||
# Run with logging
|
||||
RUST_LOG=debug cargo run -p cm-dashboard-agent
|
||||
# Code quality
|
||||
cargo fmt --all
|
||||
cargo clippy --workspace -- -D warnings
|
||||
```
|
||||
|
||||
### Architecture Principles
|
||||
## Deployment
|
||||
|
||||
#### Status Calculation Rules
|
||||
- **Agent calculates all status** using predefined thresholds
|
||||
- **Dashboard never calculates status** - only displays agent data
|
||||
- **No hardcoded thresholds in dashboard** widgets
|
||||
- **Use "unknown" when agent status missing** (never default to "ok")
|
||||
|
||||
#### Data Flow
|
||||
```
|
||||
System Metrics → Agent Collectors → Status Calculation → ZMQ → Dashboard → Display
|
||||
↓
|
||||
Email Notifications
|
||||
```
|
||||
|
||||
#### Pure Auto-Discovery
|
||||
- **No config files required** for basic operation
|
||||
- **Runtime discovery** of system capabilities
|
||||
- **Service auto-detection** via systemd patterns
|
||||
- **Storage device enumeration** via /sys filesystem
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
#### Agent Won't Start
|
||||
### Automated Binary Releases
|
||||
```bash
|
||||
# Check permissions (agent requires root)
|
||||
sudo cm-dashboard-agent -v
|
||||
|
||||
# Verify ZMQ binding
|
||||
sudo netstat -tulpn | grep 6130
|
||||
|
||||
# Check system access
|
||||
sudo smartctl --scan
|
||||
# Create new release
|
||||
cd ~/projects/cm-dashboard
|
||||
git tag v0.1.X
|
||||
git push origin v0.1.X
|
||||
```
|
||||
|
||||
#### Dashboard Connection Issues
|
||||
```bash
|
||||
# Test ZMQ connectivity
|
||||
cm-dashboard --zmq-endpoint tcp://target-host:6130 -v
|
||||
This triggers automated:
|
||||
- Static binary compilation with `RUSTFLAGS="-C target-feature=+crt-static"`
|
||||
- GitHub-style release creation
|
||||
- Tarball upload to Gitea
|
||||
|
||||
# Check network connectivity
|
||||
telnet target-host 6130
|
||||
```
|
||||
### NixOS Integration
|
||||
Update `~/projects/nixosbox/hosts/services/cm-dashboard.nix`:
|
||||
|
||||
#### Email Notifications Not Working
|
||||
```bash
|
||||
# Check postfix status
|
||||
sudo systemctl status postfix
|
||||
|
||||
# Test SMTP manually
|
||||
telnet localhost 25
|
||||
|
||||
# Verify notification settings
|
||||
sudo cm-dashboard-agent -v | grep notification
|
||||
```
|
||||
|
||||
### Logging
|
||||
Set `RUST_LOG=debug` for detailed logging:
|
||||
```bash
|
||||
RUST_LOG=debug sudo cm-dashboard-agent
|
||||
RUST_LOG=debug cm-dashboard
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
MIT License - see LICENSE file for details.
|
||||
|
||||
## Contributing
|
||||
|
||||
1. Fork the repository
|
||||
2. Create feature branch (`git checkout -b feature/amazing-feature`)
|
||||
3. Commit changes (`git commit -m 'Add amazing feature'`)
|
||||
4. Push to branch (`git push origin feature/amazing-feature`)
|
||||
5. Open Pull Request
|
||||
|
||||
For bugs and feature requests, please use GitHub Issues.
|
||||
|
||||
## NixOS Integration
|
||||
|
||||
### Updating cm-dashboard in NixOS Configuration
|
||||
|
||||
When new code is pushed to the cm-dashboard repository, follow these steps to update the NixOS configuration:
|
||||
|
||||
#### 1. Get the Latest Commit Hash
|
||||
```bash
|
||||
# Get the latest commit from the API
|
||||
curl -s "https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/commits?sha=main&limit=1" | head -20
|
||||
|
||||
# Or use git
|
||||
git log --oneline -1
|
||||
```
|
||||
|
||||
#### 2. Update the NixOS Configuration
|
||||
Edit `hosts/common/cm-dashboard.nix` and update the `rev` field:
|
||||
```nix
|
||||
src = pkgs.fetchFromGitea {
|
||||
domain = "gitea.cmtec.se";
|
||||
owner = "cm";
|
||||
repo = "cm-dashboard";
|
||||
rev = "f786d054f2ece80823f85e46933857af96e241b2"; # Update this
|
||||
hash = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA="; # Reset temporarily
|
||||
version = "v0.1.43";
|
||||
src = pkgs.fetchurl {
|
||||
url = "https://gitea.cmtec.se/cm/cm-dashboard/releases/download/${version}/cm-dashboard-linux-x86_64.tar.gz";
|
||||
sha256 = "sha256-HASH";
|
||||
};
|
||||
```
|
||||
|
||||
#### 3. Get the Correct Hash
|
||||
Build with placeholder hash to get the actual hash:
|
||||
Get hash via:
|
||||
```bash
|
||||
nix-build --no-out-link -E 'with import <nixpkgs> {}; fetchFromGitea {
|
||||
domain = "gitea.cmtec.se";
|
||||
owner = "cm";
|
||||
repo = "cm-dashboard";
|
||||
rev = "YOUR_COMMIT_HASH";
|
||||
hash = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=";
|
||||
cd ~/projects/nixosbox
|
||||
nix-build --no-out-link -E 'with import <nixpkgs> {}; fetchurl {
|
||||
url = "URL_HERE";
|
||||
sha256 = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=";
|
||||
}' 2>&1 | grep "got:"
|
||||
```
|
||||
|
||||
Example output:
|
||||
```
|
||||
error: hash mismatch in fixed-output derivation '/nix/store/...':
|
||||
specified: sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=
|
||||
got: sha256-x8crxNusOUYRrkP9mYEOG+Ga3JCPIdJLkEAc5P1ZxdQ=
|
||||
```
|
||||
## Monitoring Intervals
|
||||
|
||||
#### 4. Update the Hash
|
||||
Replace the placeholder with the correct hash from the error message (the "got:" line):
|
||||
```nix
|
||||
hash = "sha256-vjy+j91iDCHUf0RE43anK4WZ+rKcyohP/3SykwZGof8="; # Use actual hash
|
||||
```
|
||||
- **Metrics Collection**: 2 seconds (CPU, memory, services)
|
||||
- **Metric Transmission**: 2 seconds (ZMQ publish)
|
||||
- **Dashboard Updates**: 1 second (UI refresh)
|
||||
- **Email Notifications**: 30 seconds (batched)
|
||||
- **Disk Monitoring**: 300 seconds (5 minutes)
|
||||
- **Service Discovery**: 300 seconds (5 minutes cache)
|
||||
|
||||
#### 5. Update Cargo Dependencies (if needed)
|
||||
If Cargo.lock has changed, you may need to update `cargoHash`:
|
||||
```bash
|
||||
# Build to get cargo hash error
|
||||
nix-build --no-out-link --expr 'with import <nixpkgs> {}; rustPlatform.buildRustPackage rec {
|
||||
pname = "cm-dashboard";
|
||||
version = "0.1.0";
|
||||
src = fetchFromGitea {
|
||||
domain = "gitea.cmtec.se";
|
||||
owner = "cm";
|
||||
repo = "cm-dashboard";
|
||||
rev = "YOUR_COMMIT_HASH";
|
||||
hash = "YOUR_SOURCE_HASH";
|
||||
};
|
||||
cargoHash = "";
|
||||
nativeBuildInputs = [ pkg-config ];
|
||||
buildInputs = [ openssl ];
|
||||
buildAndTestSubdir = ".";
|
||||
cargoBuildFlags = [ "--workspace" ];
|
||||
}' 2>&1 | grep "got:"
|
||||
```
|
||||
## License
|
||||
|
||||
Then update `cargoHash` in the configuration.
|
||||
|
||||
#### 6. Commit the Changes
|
||||
```bash
|
||||
git add hosts/common/cm-dashboard.nix
|
||||
git commit -m "Update cm-dashboard to latest version"
|
||||
git push
|
||||
```
|
||||
|
||||
### Example Update Process
|
||||
```bash
|
||||
# 1. Get latest commit
|
||||
LATEST_COMMIT=$(curl -s "https://gitea.cmtec.se/api/v1/repos/cm/cm-dashboard/commits?sha=main&limit=1" | grep '"sha"' | head -1 | cut -d'"' -f4)
|
||||
|
||||
# 2. Get source hash
|
||||
SOURCE_HASH=$(nix-build --no-out-link -E "with import <nixpkgs> {}; fetchFromGitea { domain = \"gitea.cmtec.se\"; owner = \"cm\"; repo = \"cm-dashboard\"; rev = \"$LATEST_COMMIT\"; hash = \"sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=\"; }" 2>&1 | grep "got:" | cut -d' ' -f12)
|
||||
|
||||
# 3. Update configuration and commit
|
||||
echo "Latest commit: $LATEST_COMMIT"
|
||||
echo "Source hash: $SOURCE_HASH"
|
||||
```
|
||||
MIT License - see LICENSE file for details.
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.0"
|
||||
version = "0.1.184"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
@@ -17,6 +17,7 @@ tracing = { workspace = true }
|
||||
tracing-subscriber = { workspace = true }
|
||||
lettre = { workspace = true }
|
||||
gethostname = { workspace = true }
|
||||
chrono-tz = "0.8"
|
||||
toml = { workspace = true }
|
||||
async-trait = "0.1"
|
||||
reqwest = { version = "0.11", features = ["json", "blocking"] }
|
||||
@@ -1,91 +1,138 @@
|
||||
use anyhow::Result;
|
||||
use gethostname::gethostname;
|
||||
use std::time::Duration;
|
||||
use tokio::time::interval;
|
||||
use tracing::{info, error, debug};
|
||||
use gethostname::gethostname;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
use crate::communication::{AgentCommand, ZmqHandler};
|
||||
use crate::config::AgentConfig;
|
||||
use crate::communication::{ZmqHandler, AgentCommand};
|
||||
use crate::metrics::MetricCollectionManager;
|
||||
use crate::collectors::{
|
||||
Collector,
|
||||
backup::BackupCollector,
|
||||
cpu::CpuCollector,
|
||||
disk::DiskCollector,
|
||||
memory::MemoryCollector,
|
||||
network::NetworkCollector,
|
||||
nixos::NixOSCollector,
|
||||
systemd::SystemdCollector,
|
||||
};
|
||||
use crate::notifications::NotificationManager;
|
||||
use cm_dashboard_shared::{Metric, MetricMessage};
|
||||
use cm_dashboard_shared::AgentData;
|
||||
|
||||
pub struct Agent {
|
||||
hostname: String,
|
||||
config: AgentConfig,
|
||||
zmq_handler: ZmqHandler,
|
||||
metric_manager: MetricCollectionManager,
|
||||
collectors: Vec<Box<dyn Collector>>,
|
||||
notification_manager: NotificationManager,
|
||||
previous_status: Option<SystemStatus>,
|
||||
}
|
||||
|
||||
/// Track system component status for change detection
|
||||
#[derive(Debug, Clone)]
|
||||
struct SystemStatus {
|
||||
cpu_load_status: cm_dashboard_shared::Status,
|
||||
cpu_temperature_status: cm_dashboard_shared::Status,
|
||||
memory_usage_status: cm_dashboard_shared::Status,
|
||||
// Add more as needed
|
||||
}
|
||||
|
||||
impl Agent {
|
||||
pub async fn new(config_path: Option<String>) -> Result<Self> {
|
||||
let hostname = gethostname().to_string_lossy().to_string();
|
||||
info!("Initializing agent for host: {}", hostname);
|
||||
|
||||
// Load configuration
|
||||
let config = if let Some(path) = config_path {
|
||||
AgentConfig::load_from_file(&path)?
|
||||
} else {
|
||||
AgentConfig::default()
|
||||
};
|
||||
|
||||
|
||||
// Load configuration (now required)
|
||||
let config_path = config_path.ok_or_else(|| anyhow::anyhow!("Configuration file path is required"))?;
|
||||
let config = AgentConfig::from_file(&config_path)?;
|
||||
|
||||
info!("Agent configuration loaded");
|
||||
|
||||
|
||||
// Initialize ZMQ communication
|
||||
let zmq_handler = ZmqHandler::new(&config.zmq).await?;
|
||||
info!("ZMQ communication initialized on port {}", config.zmq.publisher_port);
|
||||
info!(
|
||||
"ZMQ communication initialized on port {}",
|
||||
config.zmq.publisher_port
|
||||
);
|
||||
|
||||
// Initialize collectors
|
||||
let mut collectors: Vec<Box<dyn Collector>> = Vec::new();
|
||||
|
||||
// Initialize metric collection manager with cache config
|
||||
let metric_manager = MetricCollectionManager::new(&config.collectors, &config).await?;
|
||||
info!("Metric collection manager initialized");
|
||||
// Add enabled collectors
|
||||
if config.collectors.cpu.enabled {
|
||||
collectors.push(Box::new(CpuCollector::new(config.collectors.cpu.clone())));
|
||||
}
|
||||
|
||||
if config.collectors.memory.enabled {
|
||||
collectors.push(Box::new(MemoryCollector::new(config.collectors.memory.clone())));
|
||||
}
|
||||
|
||||
if config.collectors.disk.enabled {
|
||||
collectors.push(Box::new(DiskCollector::new(config.collectors.disk.clone())));
|
||||
}
|
||||
|
||||
if config.collectors.systemd.enabled {
|
||||
collectors.push(Box::new(SystemdCollector::new(config.collectors.systemd.clone())));
|
||||
}
|
||||
|
||||
if config.collectors.backup.enabled {
|
||||
collectors.push(Box::new(BackupCollector::new()));
|
||||
}
|
||||
|
||||
if config.collectors.network.enabled {
|
||||
collectors.push(Box::new(NetworkCollector::new(config.collectors.network.clone())));
|
||||
}
|
||||
|
||||
if config.collectors.nixos.enabled {
|
||||
collectors.push(Box::new(NixOSCollector::new(config.collectors.nixos.clone())));
|
||||
}
|
||||
|
||||
info!("Initialized {} collectors", collectors.len());
|
||||
|
||||
// Initialize notification manager
|
||||
let notification_manager = NotificationManager::new(&config.notifications, &hostname)?;
|
||||
info!("Notification manager initialized");
|
||||
|
||||
|
||||
Ok(Self {
|
||||
hostname,
|
||||
config,
|
||||
zmq_handler,
|
||||
metric_manager,
|
||||
collectors,
|
||||
notification_manager,
|
||||
previous_status: None,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Main agent loop with structured data collection
|
||||
pub async fn run(&mut self, mut shutdown_rx: tokio::sync::oneshot::Receiver<()>) -> Result<()> {
|
||||
info!("Starting agent main loop with separated collection and transmission");
|
||||
|
||||
// CRITICAL: Collect ALL data immediately at startup before entering the loop
|
||||
info!("Performing initial FORCE collection of all metrics at startup");
|
||||
if let Err(e) = self.collect_all_metrics_force().await {
|
||||
error!("Failed to collect initial metrics: {}", e);
|
||||
} else {
|
||||
info!("Initial metric collection completed - all data cached and ready");
|
||||
info!("Starting agent main loop");
|
||||
|
||||
// Initial collection
|
||||
if let Err(e) = self.collect_and_broadcast().await {
|
||||
error!("Initial metric collection failed: {}", e);
|
||||
}
|
||||
|
||||
// Separate intervals for collection and transmission
|
||||
let mut collection_interval = interval(Duration::from_secs(self.config.collection_interval_seconds));
|
||||
let mut transmission_interval = interval(Duration::from_secs(1)); // ZMQ broadcast every 1 second
|
||||
let mut notification_check_interval = interval(Duration::from_secs(30)); // Check notifications every 30s
|
||||
|
||||
|
||||
// Set up intervals
|
||||
let mut transmission_interval = interval(Duration::from_secs(
|
||||
self.config.collection_interval_seconds,
|
||||
));
|
||||
let mut notification_interval = interval(Duration::from_secs(30)); // Check notifications every 30s
|
||||
|
||||
// Skip initial ticks to avoid immediate execution
|
||||
transmission_interval.tick().await;
|
||||
notification_interval.tick().await;
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = collection_interval.tick() => {
|
||||
// Only collect and cache metrics, no ZMQ transmission
|
||||
if let Err(e) = self.collect_metrics_only().await {
|
||||
error!("Failed to collect metrics: {}", e);
|
||||
}
|
||||
}
|
||||
_ = transmission_interval.tick() => {
|
||||
// Send all cached metrics via ZMQ every 1 second
|
||||
if let Err(e) = self.broadcast_all_cached_metrics().await {
|
||||
error!("Failed to broadcast cached metrics: {}", e);
|
||||
if let Err(e) = self.collect_and_broadcast().await {
|
||||
error!("Failed to collect and broadcast metrics: {}", e);
|
||||
}
|
||||
}
|
||||
_ = notification_check_interval.tick() => {
|
||||
// Handle any pending notifications
|
||||
self.notification_manager.process_pending().await;
|
||||
_ = notification_interval.tick() => {
|
||||
// Process any pending notifications
|
||||
// NOTE: With structured data, we might need to implement status tracking differently
|
||||
// For now, we skip this until status evaluation is migrated
|
||||
}
|
||||
// Handle incoming commands (check periodically)
|
||||
_ = tokio::time::sleep(Duration::from_millis(100)) => {
|
||||
@@ -99,125 +146,149 @@ impl Agent {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
info!("Agent main loop stopped");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn collect_all_metrics_force(&mut self) -> Result<()> {
|
||||
info!("Starting FORCE metric collection for startup");
|
||||
|
||||
// Force collect all metrics from all collectors immediately
|
||||
let metrics = self.metric_manager.collect_all_metrics_force().await?;
|
||||
|
||||
if metrics.is_empty() {
|
||||
error!("No metrics collected during force collection!");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!("Force collected and cached {} metrics", metrics.len());
|
||||
|
||||
// Check for status changes and send notifications
|
||||
self.check_status_changes(&metrics).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn collect_metrics_only(&mut self) -> Result<()> {
|
||||
debug!("Starting metric collection cycle (cache only)");
|
||||
|
||||
// Collect all metrics from all collectors and cache them
|
||||
let metrics = self.metric_manager.collect_all_metrics().await?;
|
||||
|
||||
if metrics.is_empty() {
|
||||
debug!("No metrics collected this cycle");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
debug!("Collected and cached {} metrics", metrics.len());
|
||||
|
||||
// Check for status changes and send notifications
|
||||
self.check_status_changes(&metrics).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn broadcast_all_cached_metrics(&mut self) -> Result<()> {
|
||||
debug!("Broadcasting all cached metrics via ZMQ");
|
||||
|
||||
// Get all cached metrics from the metric manager
|
||||
let cached_metrics = self.metric_manager.get_all_cached_metrics().await?;
|
||||
|
||||
if cached_metrics.is_empty() {
|
||||
debug!("No cached metrics to broadcast");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
debug!("Broadcasting {} cached metrics", cached_metrics.len());
|
||||
|
||||
// Create and send message with all cached data
|
||||
let message = MetricMessage::new(self.hostname.clone(), cached_metrics);
|
||||
self.zmq_handler.publish_metrics(&message).await?;
|
||||
|
||||
debug!("Cached metrics broadcasted successfully");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn check_status_changes(&mut self, metrics: &[Metric]) {
|
||||
for metric in metrics {
|
||||
if let Some(status_change) = self.notification_manager.update_metric_status(&metric.name, metric.status) {
|
||||
info!("Status change detected for {}: {:?} -> {:?}",
|
||||
metric.name, status_change.old_status, status_change.new_status);
|
||||
|
||||
// Send notification for status change
|
||||
if let Err(e) = self.notification_manager.send_status_change_notification(status_change, metric).await {
|
||||
error!("Failed to send notification: {}", e);
|
||||
}
|
||||
|
||||
/// Collect structured data from all collectors and broadcast via ZMQ
|
||||
async fn collect_and_broadcast(&mut self) -> Result<()> {
|
||||
debug!("Starting structured data collection");
|
||||
|
||||
// Initialize empty AgentData
|
||||
let mut agent_data = AgentData::new(self.hostname.clone(), env!("CARGO_PKG_VERSION").to_string());
|
||||
|
||||
// Collect data from all collectors
|
||||
for collector in &self.collectors {
|
||||
if let Err(e) = collector.collect_structured(&mut agent_data).await {
|
||||
error!("Collector failed: {}", e);
|
||||
// Continue with other collectors even if one fails
|
||||
}
|
||||
}
|
||||
|
||||
// Check for status changes and send notifications
|
||||
if let Err(e) = self.check_status_changes_and_notify(&agent_data).await {
|
||||
error!("Failed to check status changes: {}", e);
|
||||
}
|
||||
|
||||
// Broadcast the structured data via ZMQ
|
||||
if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data).await {
|
||||
error!("Failed to broadcast agent data: {}", e);
|
||||
} else {
|
||||
debug!("Successfully broadcast structured agent data");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Check for status changes and send notifications
|
||||
async fn check_status_changes_and_notify(&mut self, agent_data: &AgentData) -> Result<()> {
|
||||
// Extract current status
|
||||
let current_status = SystemStatus {
|
||||
cpu_load_status: agent_data.system.cpu.load_status.clone(),
|
||||
cpu_temperature_status: agent_data.system.cpu.temperature_status.clone(),
|
||||
memory_usage_status: agent_data.system.memory.usage_status.clone(),
|
||||
};
|
||||
|
||||
// Check for status changes
|
||||
if let Some(previous) = self.previous_status.clone() {
|
||||
self.check_and_notify_status_change(
|
||||
"CPU Load",
|
||||
&previous.cpu_load_status,
|
||||
¤t_status.cpu_load_status,
|
||||
format!("CPU load: {:.1}", agent_data.system.cpu.load_1min)
|
||||
).await?;
|
||||
|
||||
self.check_and_notify_status_change(
|
||||
"CPU Temperature",
|
||||
&previous.cpu_temperature_status,
|
||||
¤t_status.cpu_temperature_status,
|
||||
format!("CPU temperature: {}°C",
|
||||
agent_data.system.cpu.temperature_celsius.unwrap_or(0.0) as i32)
|
||||
).await?;
|
||||
|
||||
self.check_and_notify_status_change(
|
||||
"Memory Usage",
|
||||
&previous.memory_usage_status,
|
||||
¤t_status.memory_usage_status,
|
||||
format!("Memory usage: {:.1}%", agent_data.system.memory.usage_percent)
|
||||
).await?;
|
||||
}
|
||||
|
||||
// Store current status for next comparison
|
||||
self.previous_status = Some(current_status);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check individual status change and send notification if degraded
|
||||
async fn check_and_notify_status_change(
|
||||
&mut self,
|
||||
component: &str,
|
||||
previous: &cm_dashboard_shared::Status,
|
||||
current: &cm_dashboard_shared::Status,
|
||||
details: String
|
||||
) -> Result<()> {
|
||||
use cm_dashboard_shared::Status;
|
||||
|
||||
// Only notify on status degradation (OK → Warning/Critical, Warning → Critical)
|
||||
let should_notify = match (previous, current) {
|
||||
(Status::Ok, Status::Warning) => true,
|
||||
(Status::Ok, Status::Critical) => true,
|
||||
(Status::Warning, Status::Critical) => true,
|
||||
_ => false,
|
||||
};
|
||||
|
||||
if should_notify {
|
||||
let subject = format!("{} {} Alert", self.hostname, component);
|
||||
let body = format!(
|
||||
"Alert: {} status changed from {:?} to {:?}\n\nDetails: {}\n\nTime: {}",
|
||||
component,
|
||||
previous,
|
||||
current,
|
||||
details,
|
||||
chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC")
|
||||
);
|
||||
|
||||
info!("Sending notification: {} - {:?} → {:?}", component, previous, current);
|
||||
|
||||
if let Err(e) = self.notification_manager.send_direct_email(&subject, &body).await {
|
||||
error!("Failed to send notification for {}: {}", component, e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle incoming commands from dashboard
|
||||
async fn handle_commands(&mut self) -> Result<()> {
|
||||
// Try to receive commands (non-blocking)
|
||||
match self.zmq_handler.try_receive_command() {
|
||||
Ok(Some(command)) => {
|
||||
info!("Received command: {:?}", command);
|
||||
self.process_command(command).await?;
|
||||
}
|
||||
Ok(None) => {
|
||||
// No command available - this is normal
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Error receiving command: {}", e);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn process_command(&mut self, command: AgentCommand) -> Result<()> {
|
||||
match command {
|
||||
AgentCommand::CollectNow => {
|
||||
info!("Processing CollectNow command");
|
||||
if let Err(e) = self.collect_metrics_only().await {
|
||||
error!("Failed to collect metrics on command: {}", e);
|
||||
// Try to receive a command (non-blocking)
|
||||
if let Ok(Some(command)) = self.zmq_handler.try_receive_command() {
|
||||
info!("Received command: {:?}", command);
|
||||
|
||||
match command {
|
||||
AgentCommand::CollectNow => {
|
||||
info!("Received immediate collection request");
|
||||
if let Err(e) = self.collect_and_broadcast().await {
|
||||
error!("Failed to collect on demand: {}", e);
|
||||
}
|
||||
}
|
||||
AgentCommand::SetInterval { seconds } => {
|
||||
info!("Received interval change request: {}s", seconds);
|
||||
// Note: This would require more complex handling to update the interval
|
||||
// For now, just acknowledge
|
||||
}
|
||||
AgentCommand::ToggleCollector { name, enabled } => {
|
||||
info!("Received collector toggle request: {} -> {}", name, enabled);
|
||||
// Note: This would require more complex handling to enable/disable collectors
|
||||
// For now, just acknowledge
|
||||
}
|
||||
AgentCommand::Ping => {
|
||||
info!("Received ping command");
|
||||
// Maybe send back a pong or status
|
||||
}
|
||||
}
|
||||
AgentCommand::SetInterval { seconds } => {
|
||||
info!("Processing SetInterval command: {} seconds", seconds);
|
||||
// Note: This would require modifying the interval, which is complex
|
||||
// For now, just log the request
|
||||
info!("Interval change requested but not implemented yet");
|
||||
}
|
||||
AgentCommand::ToggleCollector { name, enabled } => {
|
||||
info!("Processing ToggleCollector command: {} -> {}", name, enabled);
|
||||
// Note: This would require dynamic collector management
|
||||
info!("Collector toggle requested but not implemented yet");
|
||||
}
|
||||
AgentCommand::Ping => {
|
||||
info!("Processing Ping command - agent is alive");
|
||||
// Could send a response back via ZMQ if needed
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
11
agent/src/cache/cached_metric.rs
vendored
11
agent/src/cache/cached_metric.rs
vendored
@@ -1,11 +0,0 @@
|
||||
use cm_dashboard_shared::{CacheTier, Metric};
|
||||
use std::time::Instant;
|
||||
|
||||
/// A cached metric with metadata
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CachedMetric {
|
||||
pub metric: Metric,
|
||||
pub collected_at: Instant,
|
||||
pub access_count: u64,
|
||||
pub tier: Option<CacheTier>,
|
||||
}
|
||||
36
agent/src/cache/manager.rs
vendored
36
agent/src/cache/manager.rs
vendored
@@ -1,36 +0,0 @@
|
||||
use super::ConfigurableCache;
|
||||
use cm_dashboard_shared::{CacheConfig, Metric};
|
||||
use std::sync::Arc;
|
||||
use tracing::info;
|
||||
|
||||
/// Manages metric caching with background tasks
|
||||
pub struct MetricCacheManager {
|
||||
cache: Arc<ConfigurableCache>,
|
||||
}
|
||||
|
||||
impl MetricCacheManager {
|
||||
pub fn new(config: CacheConfig) -> Self {
|
||||
let cache = Arc::new(ConfigurableCache::new(config.clone()));
|
||||
|
||||
Self {
|
||||
cache,
|
||||
}
|
||||
}
|
||||
|
||||
/// Start background cache management tasks
|
||||
pub async fn start_background_tasks(&self) {
|
||||
// Temporarily disabled to isolate CPU usage issue
|
||||
info!("Cache manager background tasks disabled for debugging");
|
||||
}
|
||||
|
||||
/// Store metric in cache
|
||||
pub async fn cache_metric(&self, metric: Metric) {
|
||||
self.cache.store_metric(metric).await;
|
||||
}
|
||||
|
||||
/// Get all cached metrics (including expired ones) for broadcasting
|
||||
pub async fn get_all_cached_metrics(&self) -> Vec<Metric> {
|
||||
self.cache.get_all_cached_metrics().await
|
||||
}
|
||||
|
||||
}
|
||||
101
agent/src/cache/mod.rs
vendored
101
agent/src/cache/mod.rs
vendored
@@ -1,101 +0,0 @@
|
||||
use cm_dashboard_shared::{CacheConfig, Metric};
|
||||
use std::collections::HashMap;
|
||||
use std::time::Instant;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::warn;
|
||||
|
||||
mod manager;
|
||||
mod cached_metric;
|
||||
|
||||
pub use manager::MetricCacheManager;
|
||||
pub use cached_metric::CachedMetric;
|
||||
|
||||
/// Central cache for individual metrics with configurable tiers
|
||||
pub struct ConfigurableCache {
|
||||
cache: RwLock<HashMap<String, CachedMetric>>,
|
||||
config: CacheConfig,
|
||||
}
|
||||
|
||||
impl ConfigurableCache {
|
||||
pub fn new(config: CacheConfig) -> Self {
|
||||
Self {
|
||||
cache: RwLock::new(HashMap::new()),
|
||||
config,
|
||||
}
|
||||
}
|
||||
|
||||
/// Store metric in cache
|
||||
pub async fn store_metric(&self, metric: Metric) {
|
||||
if !self.config.enabled {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut cache = self.cache.write().await;
|
||||
|
||||
// Enforce max entries limit
|
||||
if cache.len() >= self.config.max_entries {
|
||||
self.cleanup_old_entries(&mut cache).await;
|
||||
}
|
||||
|
||||
let cached_metric = CachedMetric {
|
||||
metric: metric.clone(),
|
||||
collected_at: Instant::now(),
|
||||
access_count: 1,
|
||||
tier: self.config.get_tier_for_metric(&metric.name).cloned(),
|
||||
};
|
||||
|
||||
cache.insert(metric.name.clone(), cached_metric);
|
||||
|
||||
// Cached metric (debug logging disabled for performance)
|
||||
}
|
||||
|
||||
|
||||
/// Get all cached metrics (including expired ones) for broadcasting
|
||||
pub async fn get_all_cached_metrics(&self) -> Vec<Metric> {
|
||||
if !self.config.enabled {
|
||||
return vec![];
|
||||
}
|
||||
|
||||
let cache = self.cache.read().await;
|
||||
let mut all_metrics = Vec::new();
|
||||
|
||||
for cached_metric in cache.values() {
|
||||
all_metrics.push(cached_metric.metric.clone());
|
||||
}
|
||||
|
||||
all_metrics
|
||||
}
|
||||
|
||||
/// Background cleanup of old entries
|
||||
async fn cleanup_old_entries(&self, cache: &mut HashMap<String, CachedMetric>) {
|
||||
let mut to_remove = Vec::new();
|
||||
|
||||
for (metric_name, cached_metric) in cache.iter() {
|
||||
let cache_interval = self.config.get_cache_interval(metric_name);
|
||||
let elapsed = cached_metric.collected_at.elapsed().as_secs();
|
||||
|
||||
// Remove entries that are way past their expiration (2x interval)
|
||||
if elapsed > cache_interval * 2 {
|
||||
to_remove.push(metric_name.clone());
|
||||
}
|
||||
}
|
||||
|
||||
for metric_name in to_remove {
|
||||
cache.remove(&metric_name);
|
||||
}
|
||||
|
||||
// If still too many entries, remove least recently accessed
|
||||
if cache.len() >= self.config.max_entries {
|
||||
let mut entries: Vec<_> = cache.iter().map(|(k, v)| (k.clone(), v.access_count)).collect();
|
||||
entries.sort_by_key(|(_, access_count)| *access_count);
|
||||
|
||||
let excess = cache.len() - (self.config.max_entries * 3 / 4); // Remove 25%
|
||||
for (metric_name, _) in entries.iter().take(excess) {
|
||||
cache.remove(metric_name);
|
||||
}
|
||||
|
||||
warn!("Cache cleanup removed {} entries due to size limit", excess);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,356 +1,120 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status};
|
||||
use chrono::Utc;
|
||||
use cm_dashboard_shared::{AgentData, BackupData, BackupDiskData};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use tokio::fs;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError};
|
||||
use tracing::error;
|
||||
|
||||
/// Backup collector that reads TOML status files for borgbackup metrics
|
||||
#[derive(Debug, Clone)]
|
||||
/// Backup collector that reads backup status from TOML files with structured data output
|
||||
pub struct BackupCollector {
|
||||
pub backup_status_file: String,
|
||||
pub max_age_hours: u64,
|
||||
/// Path to backup status file
|
||||
status_file_path: String,
|
||||
}
|
||||
|
||||
impl BackupCollector {
|
||||
pub fn new(backup_status_file: Option<String>, max_age_hours: u64) -> Self {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
backup_status_file: backup_status_file.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
|
||||
max_age_hours,
|
||||
status_file_path: "/var/lib/backup/backup-status.toml".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn read_backup_status(&self) -> Result<BackupStatusToml, CollectorError> {
|
||||
let content = fs::read_to_string(&self.backup_status_file)
|
||||
.await
|
||||
/// Read backup status from TOML file
|
||||
async fn read_backup_status(&self) -> Result<Option<BackupStatusToml>, CollectorError> {
|
||||
if !Path::new(&self.status_file_path).exists() {
|
||||
debug!("Backup status file not found: {}", self.status_file_path);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let content = fs::read_to_string(&self.status_file_path)
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: self.backup_status_file.clone(),
|
||||
path: self.status_file_path.clone(),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
|
||||
toml::from_str(&content).map_err(|e| CollectorError::Parse {
|
||||
value: "backup status TOML".to_string(),
|
||||
error: e.to_string(),
|
||||
})
|
||||
let status: BackupStatusToml = toml::from_str(&content)
|
||||
.map_err(|e| CollectorError::Parse {
|
||||
value: content.clone(),
|
||||
error: format!("Failed to parse backup status TOML: {}", e),
|
||||
})?;
|
||||
|
||||
Ok(Some(status))
|
||||
}
|
||||
|
||||
fn calculate_backup_status(&self, backup_status: &BackupStatusToml) -> Status {
|
||||
// Parse the start time to check age - handle both RFC3339 and local timestamp formats
|
||||
let start_time = match chrono::DateTime::parse_from_rfc3339(&backup_status.start_time) {
|
||||
Ok(dt) => dt.with_timezone(&Utc),
|
||||
Err(_) => {
|
||||
// Try parsing as naive datetime and assume UTC
|
||||
match chrono::NaiveDateTime::parse_from_str(&backup_status.start_time, "%Y-%m-%dT%H:%M:%S%.f") {
|
||||
Ok(naive_dt) => naive_dt.and_utc(),
|
||||
Err(_) => {
|
||||
error!("Failed to parse backup timestamp: {}", backup_status.start_time);
|
||||
return Status::Unknown;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
/// Convert BackupStatusToml to BackupData and populate AgentData
|
||||
async fn populate_backup_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
if let Some(backup_status) = self.read_backup_status().await? {
|
||||
// Use raw start_time string from TOML
|
||||
|
||||
let hours_since_backup = Utc::now().signed_duration_since(start_time).num_hours();
|
||||
// Extract disk information
|
||||
let repository_disk = if let Some(disk_space) = &backup_status.disk_space {
|
||||
Some(BackupDiskData {
|
||||
serial: backup_status.disk_serial_number.clone().unwrap_or_else(|| "Unknown".to_string()),
|
||||
usage_percent: disk_space.usage_percent as f32,
|
||||
used_gb: disk_space.used_gb as f32,
|
||||
total_gb: disk_space.total_gb as f32,
|
||||
wear_percent: backup_status.disk_wear_percent,
|
||||
temperature_celsius: None, // Not available in current TOML
|
||||
})
|
||||
} else if let Some(serial) = &backup_status.disk_serial_number {
|
||||
// Fallback: create minimal disk info if we have serial but no disk_space
|
||||
Some(BackupDiskData {
|
||||
serial: serial.clone(),
|
||||
usage_percent: 0.0,
|
||||
used_gb: 0.0,
|
||||
total_gb: 0.0,
|
||||
wear_percent: backup_status.disk_wear_percent,
|
||||
temperature_celsius: None,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Check overall backup status
|
||||
match backup_status.status.as_str() {
|
||||
"success" => {
|
||||
if hours_since_backup > self.max_age_hours as i64 {
|
||||
Status::Warning // Backup too old
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
},
|
||||
"failed" => Status::Critical,
|
||||
"running" => Status::Ok, // Currently running is OK
|
||||
_ => Status::Unknown,
|
||||
// Calculate total repository size from services
|
||||
let total_size_gb = backup_status.services
|
||||
.values()
|
||||
.map(|service| service.repo_size_bytes as f32 / (1024.0 * 1024.0 * 1024.0))
|
||||
.sum::<f32>();
|
||||
|
||||
let backup_data = BackupData {
|
||||
status: backup_status.status,
|
||||
total_size_gb: Some(total_size_gb),
|
||||
repository_health: Some("ok".to_string()), // Derive from status if needed
|
||||
repository_disk,
|
||||
last_backup_size_gb: None, // Not available in current TOML format
|
||||
start_time_raw: Some(backup_status.start_time),
|
||||
};
|
||||
|
||||
agent_data.backup = backup_data;
|
||||
} else {
|
||||
// No backup status available - set default values
|
||||
agent_data.backup = BackupData {
|
||||
status: "unavailable".to_string(),
|
||||
total_size_gb: None,
|
||||
repository_health: None,
|
||||
repository_disk: None,
|
||||
last_backup_size_gb: None,
|
||||
start_time_raw: None,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
fn calculate_service_status(&self, service: &ServiceStatus) -> Status {
|
||||
match service.status.as_str() {
|
||||
"completed" => {
|
||||
if service.exit_code == 0 {
|
||||
Status::Ok
|
||||
} else {
|
||||
Status::Critical
|
||||
}
|
||||
},
|
||||
"failed" => Status::Critical,
|
||||
"disabled" => Status::Warning, // Service intentionally disabled
|
||||
"running" => Status::Ok,
|
||||
_ => Status::Unknown,
|
||||
}
|
||||
}
|
||||
|
||||
fn bytes_to_gb(bytes: u64) -> f32 {
|
||||
bytes as f32 / (1024.0 * 1024.0 * 1024.0)
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for BackupCollector {
|
||||
fn name(&self) -> &str {
|
||||
"backup"
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
let backup_status = self.read_backup_status().await?;
|
||||
let mut metrics = Vec::new();
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
// Overall backup status
|
||||
let overall_status = self.calculate_backup_status(&backup_status);
|
||||
metrics.push(Metric {
|
||||
name: "backup_overall_status".to_string(),
|
||||
value: MetricValue::String(match overall_status {
|
||||
Status::Ok => "ok".to_string(),
|
||||
Status::Warning => "warning".to_string(),
|
||||
Status::Critical => "critical".to_string(),
|
||||
Status::Unknown => "unknown".to_string(),
|
||||
}),
|
||||
status: overall_status,
|
||||
timestamp,
|
||||
description: Some(format!("Backup: {} at {}", backup_status.status, backup_status.start_time)),
|
||||
unit: None,
|
||||
});
|
||||
|
||||
// Backup duration
|
||||
metrics.push(Metric {
|
||||
name: "backup_duration_seconds".to_string(),
|
||||
value: MetricValue::Integer(backup_status.duration_seconds),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Duration of last backup run".to_string()),
|
||||
unit: Some("seconds".to_string()),
|
||||
});
|
||||
|
||||
// Last backup timestamp - use last_updated (when backup finished) instead of start_time
|
||||
let last_updated_dt_result = chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
|
||||
.map(|dt| dt.with_timezone(&Utc))
|
||||
.or_else(|_| {
|
||||
// Try parsing as naive datetime and assume UTC
|
||||
chrono::NaiveDateTime::parse_from_str(&backup_status.last_updated, "%Y-%m-%dT%H:%M:%S%.f")
|
||||
.map(|naive_dt| naive_dt.and_utc())
|
||||
});
|
||||
|
||||
if let Ok(last_updated_dt) = last_updated_dt_result {
|
||||
metrics.push(Metric {
|
||||
name: "backup_last_run_timestamp".to_string(),
|
||||
value: MetricValue::Integer(last_updated_dt.timestamp()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Timestamp of last backup completion".to_string()),
|
||||
unit: Some("unix_timestamp".to_string()),
|
||||
});
|
||||
} else {
|
||||
error!("Failed to parse backup timestamp for last_run_timestamp: {}", backup_status.last_updated);
|
||||
}
|
||||
|
||||
// Individual service metrics
|
||||
for (service_name, service) in &backup_status.services {
|
||||
let service_status = self.calculate_service_status(service);
|
||||
|
||||
// Service status
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_status", service_name),
|
||||
value: MetricValue::String(match service_status {
|
||||
Status::Ok => "ok".to_string(),
|
||||
Status::Warning => "warning".to_string(),
|
||||
Status::Critical => "critical".to_string(),
|
||||
Status::Unknown => "unknown".to_string(),
|
||||
}),
|
||||
status: service_status,
|
||||
timestamp,
|
||||
description: Some(format!("Backup service {} status: {}", service_name, service.status)),
|
||||
unit: None,
|
||||
});
|
||||
|
||||
// Service exit code
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_exit_code", service_name),
|
||||
value: MetricValue::Integer(service.exit_code),
|
||||
status: if service.exit_code == 0 { Status::Ok } else { Status::Critical },
|
||||
timestamp,
|
||||
description: Some(format!("Exit code for backup service {}", service_name)),
|
||||
unit: None,
|
||||
});
|
||||
|
||||
// Repository archive count
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_archive_count", service_name),
|
||||
value: MetricValue::Integer(service.archive_count),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some(format!("Number of archives in {} repository", service_name)),
|
||||
unit: Some("archives".to_string()),
|
||||
});
|
||||
|
||||
// Repository size in GB
|
||||
let repo_size_gb = Self::bytes_to_gb(service.repo_size_bytes);
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_repo_size_gb", service_name),
|
||||
value: MetricValue::Float(repo_size_gb),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some(format!("Repository size for {} in GB", service_name)),
|
||||
unit: Some("GB".to_string()),
|
||||
});
|
||||
|
||||
// Repository path for reference
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_repo_path", service_name),
|
||||
value: MetricValue::String(service.repo_path.clone()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some(format!("Repository path for {}", service_name)),
|
||||
unit: None,
|
||||
});
|
||||
}
|
||||
|
||||
// Total number of services
|
||||
metrics.push(Metric {
|
||||
name: "backup_total_services".to_string(),
|
||||
value: MetricValue::Integer(backup_status.services.len() as i64),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Total number of backup services".to_string()),
|
||||
unit: Some("services".to_string()),
|
||||
});
|
||||
|
||||
// Calculate total repository size
|
||||
let total_size_bytes: u64 = backup_status.services.values()
|
||||
.map(|s| s.repo_size_bytes)
|
||||
.sum();
|
||||
let total_size_gb = Self::bytes_to_gb(total_size_bytes);
|
||||
metrics.push(Metric {
|
||||
name: "backup_total_repo_size_gb".to_string(),
|
||||
value: MetricValue::Float(total_size_gb),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Total size of all backup repositories".to_string()),
|
||||
unit: Some("GB".to_string()),
|
||||
});
|
||||
|
||||
// Disk space metrics for backup directory
|
||||
if let Some(ref disk_space) = backup_status.disk_space {
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_total_gb".to_string(),
|
||||
value: MetricValue::Float(disk_space.total_gb as f32),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Total disk space available for backups".to_string()),
|
||||
unit: Some("GB".to_string()),
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_used_gb".to_string(),
|
||||
value: MetricValue::Float(disk_space.used_gb as f32),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Used disk space on backup drive".to_string()),
|
||||
unit: Some("GB".to_string()),
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_available_gb".to_string(),
|
||||
value: MetricValue::Float(disk_space.available_gb as f32),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Available disk space on backup drive".to_string()),
|
||||
unit: Some("GB".to_string()),
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_usage_percent".to_string(),
|
||||
value: MetricValue::Float(disk_space.usage_percent as f32),
|
||||
status: if disk_space.usage_percent >= 95.0 {
|
||||
Status::Critical
|
||||
} else if disk_space.usage_percent >= 85.0 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
},
|
||||
timestamp,
|
||||
description: Some("Backup disk usage percentage".to_string()),
|
||||
unit: Some("percent".to_string()),
|
||||
});
|
||||
|
||||
// Add disk identification metrics if available from disk_space
|
||||
if let Some(ref product_name) = disk_space.product_name {
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_product_name".to_string(),
|
||||
value: MetricValue::String(product_name.clone()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Backup disk product name from SMART data".to_string()),
|
||||
unit: None,
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(ref serial_number) = disk_space.serial_number {
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_serial_number".to_string(),
|
||||
value: MetricValue::String(serial_number.clone()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Backup disk serial number from SMART data".to_string()),
|
||||
unit: None,
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Add standalone disk identification metrics from TOML fields
|
||||
if let Some(ref product_name) = backup_status.disk_product_name {
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_product_name".to_string(),
|
||||
value: MetricValue::String(product_name.clone()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Backup disk product name from SMART data".to_string()),
|
||||
unit: None,
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(ref serial_number) = backup_status.disk_serial_number {
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_serial_number".to_string(),
|
||||
value: MetricValue::String(serial_number.clone()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Backup disk serial number from SMART data".to_string()),
|
||||
unit: None,
|
||||
});
|
||||
}
|
||||
|
||||
// Count services by status
|
||||
let mut status_counts = HashMap::new();
|
||||
for service in backup_status.services.values() {
|
||||
*status_counts.entry(service.status.clone()).or_insert(0) += 1;
|
||||
}
|
||||
|
||||
for (status_name, count) in status_counts {
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_services_{}_count", status_name),
|
||||
value: MetricValue::Integer(count),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some(format!("Number of services with status: {}", status_name)),
|
||||
unit: Some("services".to_string()),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(metrics)
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
debug!("Collecting backup status");
|
||||
self.populate_backup_data(agent_data).await
|
||||
}
|
||||
}
|
||||
|
||||
/// TOML structure for backup status file
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct BackupStatusToml {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
struct BackupStatusToml {
|
||||
pub backup_name: String,
|
||||
pub start_time: String,
|
||||
pub current_time: String,
|
||||
@@ -360,11 +124,12 @@ pub struct BackupStatusToml {
|
||||
pub disk_space: Option<DiskSpace>,
|
||||
pub disk_product_name: Option<String>,
|
||||
pub disk_serial_number: Option<String>,
|
||||
pub disk_wear_percent: Option<f32>,
|
||||
pub services: HashMap<String, ServiceStatus>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct DiskSpace {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
struct DiskSpace {
|
||||
pub total_bytes: u64,
|
||||
pub used_bytes: u64,
|
||||
pub available_bytes: u64,
|
||||
@@ -372,13 +137,10 @@ pub struct DiskSpace {
|
||||
pub used_gb: f64,
|
||||
pub available_gb: f64,
|
||||
pub usage_percent: f64,
|
||||
// Optional disk identification fields
|
||||
pub product_name: Option<String>,
|
||||
pub serial_number: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct ServiceStatus {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
struct ServiceStatus {
|
||||
pub status: String,
|
||||
pub exit_code: i64,
|
||||
pub repo_path: String,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{registry, Metric, MetricValue, Status};
|
||||
use cm_dashboard_shared::{AgentData, Status, HysteresisThresholds};
|
||||
|
||||
use tracing::debug;
|
||||
|
||||
@@ -15,43 +15,54 @@ use crate::config::CpuConfig;
|
||||
/// - No process spawning
|
||||
/// - <0.1ms collection time target
|
||||
pub struct CpuCollector {
|
||||
config: CpuConfig,
|
||||
name: String,
|
||||
load_thresholds: HysteresisThresholds,
|
||||
temperature_thresholds: HysteresisThresholds,
|
||||
}
|
||||
|
||||
impl CpuCollector {
|
||||
pub fn new(config: CpuConfig) -> Self {
|
||||
// Create hysteresis thresholds with 10% gap for recovery
|
||||
let load_thresholds = HysteresisThresholds::new(
|
||||
config.load_warning_threshold,
|
||||
config.load_critical_threshold,
|
||||
);
|
||||
|
||||
let temperature_thresholds = HysteresisThresholds::new(
|
||||
config.temperature_warning_threshold,
|
||||
config.temperature_critical_threshold,
|
||||
);
|
||||
|
||||
Self {
|
||||
config,
|
||||
name: "cpu".to_string(),
|
||||
load_thresholds,
|
||||
temperature_thresholds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate CPU load status using configured thresholds
|
||||
/// Calculate CPU load status using thresholds
|
||||
fn calculate_load_status(&self, load: f32) -> Status {
|
||||
if load >= self.config.load_critical_threshold {
|
||||
if load >= self.load_thresholds.critical_high {
|
||||
Status::Critical
|
||||
} else if load >= self.config.load_warning_threshold {
|
||||
Status::Warning
|
||||
} else if load >= self.load_thresholds.warning_high {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate CPU temperature status using configured thresholds
|
||||
/// Calculate CPU temperature status using thresholds
|
||||
fn calculate_temperature_status(&self, temp: f32) -> Status {
|
||||
if temp >= self.config.temperature_critical_threshold {
|
||||
if temp >= self.temperature_thresholds.critical_high {
|
||||
Status::Critical
|
||||
} else if temp >= self.config.temperature_warning_threshold {
|
||||
} else if temp >= self.temperature_thresholds.warning_high {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect CPU load averages from /proc/loadavg
|
||||
/// Collect CPU load averages and populate AgentData
|
||||
/// Format: "0.52 0.58 0.59 1/257 12345"
|
||||
async fn collect_load_averages(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect_load_averages(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
let content = utils::read_proc_file("/proc/loadavg")?;
|
||||
let parts: Vec<&str> = content.trim().split_whitespace().collect();
|
||||
|
||||
@@ -66,53 +77,25 @@ impl CpuCollector {
|
||||
let load_5min = utils::parse_f32(parts[1])?;
|
||||
let load_15min = utils::parse_f32(parts[2])?;
|
||||
|
||||
// Only apply thresholds to 5-minute load average
|
||||
let load_1min_status = Status::Ok; // No alerting on 1min
|
||||
let load_5min_status = self.calculate_load_status(load_5min); // Only 5min triggers alerts
|
||||
let load_15min_status = Status::Ok; // No alerting on 15min
|
||||
// Populate CPU data directly
|
||||
agent_data.system.cpu.load_1min = load_1min;
|
||||
agent_data.system.cpu.load_5min = load_5min;
|
||||
agent_data.system.cpu.load_15min = load_15min;
|
||||
|
||||
Ok(vec![
|
||||
Metric::new(
|
||||
registry::CPU_LOAD_1MIN.to_string(),
|
||||
MetricValue::Float(load_1min),
|
||||
load_1min_status,
|
||||
)
|
||||
.with_description("CPU load average over 1 minute".to_string()),
|
||||
Metric::new(
|
||||
registry::CPU_LOAD_5MIN.to_string(),
|
||||
MetricValue::Float(load_5min),
|
||||
load_5min_status,
|
||||
)
|
||||
.with_description("CPU load average over 5 minutes".to_string()),
|
||||
Metric::new(
|
||||
registry::CPU_LOAD_15MIN.to_string(),
|
||||
MetricValue::Float(load_15min),
|
||||
load_15min_status,
|
||||
)
|
||||
.with_description("CPU load average over 15 minutes".to_string()),
|
||||
])
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Collect CPU temperature from thermal zones
|
||||
/// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
|
||||
async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
|
||||
/// Collect CPU temperature and populate AgentData
|
||||
/// Prioritizes x86_pkg_temp over generic thermal zones
|
||||
async fn collect_temperature(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
// Try x86_pkg_temp first (Intel CPU package temperature)
|
||||
if let Ok(temp) = self
|
||||
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
|
||||
.await
|
||||
{
|
||||
let temp_celsius = temp as f32 / 1000.0;
|
||||
let status = self.calculate_temperature_status(temp_celsius);
|
||||
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
|
||||
MetricValue::Float(temp_celsius),
|
||||
status,
|
||||
)
|
||||
.with_description("CPU package temperature".to_string())
|
||||
.with_unit("°C".to_string()),
|
||||
));
|
||||
agent_data.system.cpu.temperature_celsius = Some(temp_celsius);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Fallback: try other thermal zones
|
||||
@@ -120,22 +103,14 @@ impl CpuCollector {
|
||||
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
|
||||
if let Ok(temp) = self.read_thermal_zone(&path).await {
|
||||
let temp_celsius = temp as f32 / 1000.0;
|
||||
let status = self.calculate_temperature_status(temp_celsius);
|
||||
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
|
||||
MetricValue::Float(temp_celsius),
|
||||
status,
|
||||
)
|
||||
.with_description(format!("CPU temperature from thermal_zone{}", zone_id))
|
||||
.with_unit("°C".to_string()),
|
||||
));
|
||||
agent_data.system.cpu.temperature_celsius = Some(temp_celsius);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
debug!("No CPU temperature sensors found");
|
||||
Ok(None)
|
||||
// Leave temperature as None if not available
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Read temperature from thermal zone efficiently
|
||||
@@ -144,24 +119,16 @@ impl CpuCollector {
|
||||
utils::parse_u64(content.trim())
|
||||
}
|
||||
|
||||
/// Collect CPU frequency from /proc/cpuinfo or scaling governor
|
||||
async fn collect_frequency(&self) -> Result<Option<Metric>, CollectorError> {
|
||||
/// Collect CPU frequency and populate AgentData
|
||||
async fn collect_frequency(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
// Try scaling frequency first (more accurate for current frequency)
|
||||
if let Ok(freq) =
|
||||
utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq")
|
||||
{
|
||||
if let Ok(freq_khz) = utils::parse_u64(freq.trim()) {
|
||||
let freq_mhz = freq_khz as f32 / 1000.0;
|
||||
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
registry::CPU_FREQUENCY_MHZ.to_string(),
|
||||
MetricValue::Float(freq_mhz),
|
||||
Status::Ok, // Frequency doesn't have status thresholds
|
||||
)
|
||||
.with_description("Current CPU frequency".to_string())
|
||||
.with_unit("MHz".to_string()),
|
||||
));
|
||||
agent_data.system.cpu.frequency_mhz = freq_mhz;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -171,17 +138,8 @@ impl CpuCollector {
|
||||
if line.starts_with("cpu MHz") {
|
||||
if let Some(freq_str) = line.split(':').nth(1) {
|
||||
if let Ok(freq_mhz) = utils::parse_f32(freq_str) {
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
registry::CPU_FREQUENCY_MHZ.to_string(),
|
||||
MetricValue::Float(freq_mhz),
|
||||
Status::Ok,
|
||||
)
|
||||
.with_description(
|
||||
"CPU base frequency from /proc/cpuinfo".to_string(),
|
||||
)
|
||||
.with_unit("MHz".to_string()),
|
||||
));
|
||||
agent_data.system.cpu.frequency_mhz = freq_mhz;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
break; // Only need first CPU entry
|
||||
@@ -190,41 +148,28 @@ impl CpuCollector {
|
||||
}
|
||||
|
||||
debug!("CPU frequency not available");
|
||||
Ok(None)
|
||||
// Leave frequency as 0.0 if not available
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for CpuCollector {
|
||||
fn name(&self) -> &str {
|
||||
&self.name
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
debug!("Collecting CPU metrics");
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency
|
||||
|
||||
// Collect load averages (always available)
|
||||
metrics.extend(self.collect_load_averages().await?);
|
||||
self.collect_load_averages(agent_data).await?;
|
||||
|
||||
// Collect temperature (optional)
|
||||
if let Some(temp_metric) = self.collect_temperature().await? {
|
||||
metrics.push(temp_metric);
|
||||
}
|
||||
self.collect_temperature(agent_data).await?;
|
||||
|
||||
// Collect frequency (optional)
|
||||
if let Some(freq_metric) = self.collect_frequency().await? {
|
||||
metrics.push(freq_metric);
|
||||
}
|
||||
self.collect_frequency(agent_data).await?;
|
||||
|
||||
let duration = start.elapsed();
|
||||
debug!(
|
||||
"CPU collection completed in {:?} with {} metrics",
|
||||
duration,
|
||||
metrics.len()
|
||||
);
|
||||
debug!("CPU collection completed in {:?}", duration);
|
||||
|
||||
// Efficiency check: warn if collection takes too long
|
||||
if duration.as_millis() > 1 {
|
||||
@@ -234,13 +179,14 @@ impl Collector for CpuCollector {
|
||||
);
|
||||
}
|
||||
|
||||
// Store performance metrics
|
||||
// Performance tracking handled by cache system
|
||||
// Calculate status using thresholds
|
||||
agent_data.system.cpu.load_status = self.calculate_load_status(agent_data.system.cpu.load_1min);
|
||||
agent_data.system.cpu.temperature_status = if let Some(temp) = agent_data.system.cpu.temperature_celsius {
|
||||
self.calculate_temperature_status(temp)
|
||||
} else {
|
||||
Status::Unknown
|
||||
};
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
fn get_performance_metrics(&self) -> Option<super::PerformanceMetrics> {
|
||||
None // Performance tracking handled by cache system
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,296 +1,683 @@
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status};
|
||||
use cm_dashboard_shared::{AgentData, DriveData, FilesystemData, PoolData, HysteresisThresholds, Status};
|
||||
|
||||
use crate::config::DiskConfig;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
use std::time::Instant;
|
||||
use std::collections::HashMap;
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError, PerformanceMetrics};
|
||||
use super::{Collector, CollectorError};
|
||||
|
||||
/// Information about a mounted disk
|
||||
#[derive(Debug, Clone)]
|
||||
struct MountedDisk {
|
||||
device: String, // e.g., "/dev/nvme0n1p1"
|
||||
physical_device: String, // e.g., "/dev/nvme0n1"
|
||||
mount_point: String, // e.g., "/"
|
||||
filesystem: String, // e.g., "ext4"
|
||||
size: String, // e.g., "120G"
|
||||
used: String, // e.g., "45G"
|
||||
available: String, // e.g., "75G"
|
||||
usage_percent: f32, // e.g., 38.5
|
||||
config_name: Option<String>, // Name from config if UUID-based
|
||||
}
|
||||
|
||||
/// Disk usage collector for monitoring filesystem sizes
|
||||
/// Storage collector with clean architecture and structured data output
|
||||
pub struct DiskCollector {
|
||||
config: DiskConfig,
|
||||
temperature_thresholds: HysteresisThresholds,
|
||||
}
|
||||
|
||||
/// A physical drive with its filesystems
|
||||
#[derive(Debug, Clone)]
|
||||
struct PhysicalDrive {
|
||||
name: String, // e.g., "nvme0n1", "sda"
|
||||
health: String, // SMART health status
|
||||
filesystems: Vec<Filesystem>, // mounted filesystems on this drive
|
||||
}
|
||||
|
||||
/// A filesystem mounted on a drive
|
||||
#[derive(Debug, Clone)]
|
||||
struct Filesystem {
|
||||
mount_point: String, // e.g., "/", "/boot"
|
||||
usage_percent: f32, // Usage percentage
|
||||
used_bytes: u64, // Used bytes
|
||||
total_bytes: u64, // Total bytes
|
||||
}
|
||||
|
||||
/// MergerFS pool
|
||||
#[derive(Debug, Clone)]
|
||||
struct MergerfsPool {
|
||||
name: String, // e.g., "srv_media"
|
||||
mount_point: String, // e.g., "/srv/media"
|
||||
total_bytes: u64, // Pool total bytes
|
||||
used_bytes: u64, // Pool used bytes
|
||||
data_drives: Vec<PoolDrive>, // Data drives in pool
|
||||
parity_drives: Vec<PoolDrive>, // Parity drives in pool
|
||||
}
|
||||
|
||||
/// Drive in a storage pool
|
||||
#[derive(Debug, Clone)]
|
||||
struct PoolDrive {
|
||||
name: String, // Drive name
|
||||
mount_point: String, // e.g., "/mnt/disk1"
|
||||
temperature_celsius: Option<f32>, // Drive temperature
|
||||
}
|
||||
|
||||
impl DiskCollector {
|
||||
pub fn new(config: DiskConfig) -> Self {
|
||||
Self { config }
|
||||
}
|
||||
|
||||
/// Resolve UUID to actual device path
|
||||
fn resolve_uuid_to_device(&self, uuid: &str) -> Result<String> {
|
||||
let uuid_path = format!("/dev/disk/by-uuid/{}", uuid);
|
||||
let temperature_thresholds = HysteresisThresholds::new(
|
||||
config.temperature_warning_celsius,
|
||||
config.temperature_critical_celsius,
|
||||
);
|
||||
|
||||
if Path::new(&uuid_path).exists() {
|
||||
match fs::read_link(&uuid_path) {
|
||||
Ok(target) => {
|
||||
// Convert relative path to absolute
|
||||
if target.is_relative() {
|
||||
let parent = Path::new(&uuid_path).parent().unwrap();
|
||||
let resolved = parent.join(&target);
|
||||
match resolved.canonicalize() {
|
||||
Ok(canonical) => Ok(canonical.to_string_lossy().to_string()),
|
||||
Err(_) => Ok(target.to_string_lossy().to_string()),
|
||||
}
|
||||
} else {
|
||||
Ok(target.to_string_lossy().to_string())
|
||||
}
|
||||
}
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to resolve UUID {}: {}", uuid, e)),
|
||||
}
|
||||
} else {
|
||||
Err(anyhow::anyhow!("UUID {} not found in /dev/disk/by-uuid/", uuid))
|
||||
Self {
|
||||
config,
|
||||
temperature_thresholds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get configured filesystems from UUIDs
|
||||
fn get_configured_filesystems(&self) -> Result<Vec<MountedDisk>> {
|
||||
let mut configured_disks = Vec::new();
|
||||
/// Collect all storage data and populate AgentData
|
||||
async fn collect_storage_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
let start_time = Instant::now();
|
||||
debug!("Starting clean storage collection");
|
||||
|
||||
for fs_config in &self.config.filesystems {
|
||||
if !fs_config.monitor {
|
||||
continue;
|
||||
// Step 1: Get mount points and their backing devices
|
||||
let mount_devices = self.get_mount_devices().await?;
|
||||
|
||||
// Step 2: Get filesystem usage for each mount point using df
|
||||
let mut filesystem_usage = self.get_filesystem_usage(&mount_devices).map_err(|e| CollectorError::Parse {
|
||||
value: "filesystem usage".to_string(),
|
||||
error: format!("Failed to get filesystem usage: {}", e),
|
||||
})?;
|
||||
|
||||
// Step 2.5: Add MergerFS mount points that weren't in lsblk output
|
||||
self.add_mergerfs_filesystem_usage(&mut filesystem_usage).map_err(|e| CollectorError::Parse {
|
||||
value: "mergerfs filesystem usage".to_string(),
|
||||
error: format!("Failed to get mergerfs filesystem usage: {}", e),
|
||||
})?;
|
||||
|
||||
// Step 3: Detect MergerFS pools
|
||||
let mergerfs_pools = self.detect_mergerfs_pools(&filesystem_usage).map_err(|e| CollectorError::Parse {
|
||||
value: "mergerfs pools".to_string(),
|
||||
error: format!("Failed to detect mergerfs pools: {}", e),
|
||||
})?;
|
||||
|
||||
// Step 4: Group filesystems by physical drive (excluding mergerfs members)
|
||||
let physical_drives = self.group_by_physical_drive(&mount_devices, &filesystem_usage, &mergerfs_pools).map_err(|e| CollectorError::Parse {
|
||||
value: "physical drives".to_string(),
|
||||
error: format!("Failed to group by physical drive: {}", e),
|
||||
})?;
|
||||
|
||||
// Step 5: Get SMART data for all drives
|
||||
let smart_data = self.get_smart_data_for_drives(&physical_drives, &mergerfs_pools).await;
|
||||
|
||||
// Step 6: Populate AgentData
|
||||
self.populate_drives_data(&physical_drives, &smart_data, agent_data)?;
|
||||
self.populate_pools_data(&mergerfs_pools, &smart_data, agent_data)?;
|
||||
|
||||
let elapsed = start_time.elapsed();
|
||||
debug!("Storage collection completed in {:?}", elapsed);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get block devices and their mount points using lsblk
|
||||
async fn get_mount_devices(&self) -> Result<HashMap<String, String>, CollectorError> {
|
||||
let output = Command::new("lsblk")
|
||||
.args(&["-rn", "-o", "NAME,MOUNTPOINT"])
|
||||
.output()
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: "block devices".to_string(),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
|
||||
let mut mount_devices = HashMap::new();
|
||||
for line in String::from_utf8_lossy(&output.stdout).lines() {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 2 {
|
||||
let device_name = parts[0];
|
||||
let mount_point = parts[1];
|
||||
|
||||
// Skip swap partitions and unmounted devices
|
||||
if mount_point == "[SWAP]" || mount_point.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Convert device name to full path
|
||||
let device_path = format!("/dev/{}", device_name);
|
||||
mount_devices.insert(mount_point.to_string(), device_path);
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve UUID to device
|
||||
match self.resolve_uuid_to_device(&fs_config.uuid) {
|
||||
Ok(device_path) => {
|
||||
// Get filesystem stats for the mount point
|
||||
match self.get_filesystem_info(&fs_config.mount_point) {
|
||||
Ok((total_bytes, used_bytes)) => {
|
||||
let available_bytes = total_bytes - used_bytes;
|
||||
let usage_percent = if total_bytes > 0 {
|
||||
(used_bytes as f64 / total_bytes as f64) * 100.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
debug!("Found {} mounted block devices", mount_devices.len());
|
||||
Ok(mount_devices)
|
||||
}
|
||||
|
||||
// Convert bytes to human-readable format
|
||||
let size = self.bytes_to_human_readable(total_bytes);
|
||||
let used = self.bytes_to_human_readable(used_bytes);
|
||||
let available = self.bytes_to_human_readable(available_bytes);
|
||||
|
||||
// Get physical device for SMART monitoring
|
||||
let physical_device = self.get_physical_device(&device_path)?;
|
||||
|
||||
configured_disks.push(MountedDisk {
|
||||
device: device_path.clone(),
|
||||
physical_device,
|
||||
mount_point: fs_config.mount_point.clone(),
|
||||
filesystem: fs_config.fs_type.clone(),
|
||||
size,
|
||||
used,
|
||||
available,
|
||||
usage_percent: usage_percent as f32,
|
||||
config_name: Some(fs_config.name.clone()),
|
||||
});
|
||||
|
||||
debug!(
|
||||
"Configured filesystem '{}' (UUID: {}) mounted at {} using {}",
|
||||
fs_config.name, fs_config.uuid, fs_config.mount_point, device_path
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
debug!(
|
||||
"Failed to get filesystem info for configured filesystem '{}': {}",
|
||||
fs_config.name, e
|
||||
);
|
||||
}
|
||||
}
|
||||
/// Use df to get filesystem usage for mount points
|
||||
fn get_filesystem_usage(&self, mount_devices: &HashMap<String, String>) -> anyhow::Result<HashMap<String, (u64, u64)>> {
|
||||
let mut filesystem_usage = HashMap::new();
|
||||
|
||||
for mount_point in mount_devices.keys() {
|
||||
match self.get_filesystem_info(mount_point) {
|
||||
Ok((total, used)) => {
|
||||
filesystem_usage.insert(mount_point.clone(), (total, used));
|
||||
}
|
||||
Err(e) => {
|
||||
debug!(
|
||||
"Failed to resolve UUID for configured filesystem '{}': {}",
|
||||
fs_config.name, e
|
||||
);
|
||||
debug!("Failed to get filesystem info for {}: {}", mount_point, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(configured_disks)
|
||||
Ok(filesystem_usage)
|
||||
}
|
||||
|
||||
/// Convert bytes to human-readable format
|
||||
fn bytes_to_human_readable(&self, bytes: u64) -> String {
|
||||
const UNITS: &[&str] = &["B", "K", "M", "G", "T"];
|
||||
let mut size = bytes as f64;
|
||||
let mut unit_index = 0;
|
||||
|
||||
while size >= 1024.0 && unit_index < UNITS.len() - 1 {
|
||||
size /= 1024.0;
|
||||
unit_index += 1;
|
||||
}
|
||||
|
||||
if unit_index == 0 {
|
||||
format!("{:.0}{}", size, UNITS[unit_index])
|
||||
} else {
|
||||
format!("{:.1}{}", size, UNITS[unit_index])
|
||||
/// Add filesystem usage for MergerFS mount points that aren't in lsblk
|
||||
fn add_mergerfs_filesystem_usage(&self, filesystem_usage: &mut HashMap<String, (u64, u64)>) -> anyhow::Result<()> {
|
||||
let mounts_content = std::fs::read_to_string("/proc/mounts")
|
||||
.map_err(|e| anyhow::anyhow!("Failed to read /proc/mounts: {}", e))?;
|
||||
|
||||
for line in mounts_content.lines() {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 3 && parts[2] == "fuse.mergerfs" {
|
||||
let mount_point = parts[1].to_string();
|
||||
|
||||
// Only add if we don't already have usage data for this mount point
|
||||
if !filesystem_usage.contains_key(&mount_point) {
|
||||
if let Ok((total, used)) = self.get_filesystem_info(&mount_point) {
|
||||
debug!("Added MergerFS filesystem usage for {}: {}GB total, {}GB used",
|
||||
mount_point, total as f32 / (1024.0 * 1024.0 * 1024.0), used as f32 / (1024.0 * 1024.0 * 1024.0));
|
||||
filesystem_usage.insert(mount_point, (total, used));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get directory size using du command (efficient for single directory)
|
||||
fn get_directory_size(&self, path: &str) -> Result<u64> {
|
||||
let output = Command::new("du")
|
||||
.arg("-s")
|
||||
.arg("--block-size=1")
|
||||
.arg(path)
|
||||
.output()?;
|
||||
|
||||
// du returns success even with permission denied warnings in stderr
|
||||
// We only care if the command completely failed or produced no stdout
|
||||
let output_str = String::from_utf8(output.stdout)?;
|
||||
|
||||
if output_str.trim().is_empty() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"du command produced no output for {}",
|
||||
path
|
||||
));
|
||||
}
|
||||
|
||||
let size_str = output_str
|
||||
.split_whitespace()
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::anyhow!("Failed to parse du output"))?;
|
||||
|
||||
let size_bytes = size_str.parse::<u64>()?;
|
||||
Ok(size_bytes)
|
||||
}
|
||||
|
||||
/// Get filesystem info using df command
|
||||
fn get_filesystem_info(&self, path: &str) -> Result<(u64, u64)> {
|
||||
/// Get filesystem info for a single mount point
|
||||
fn get_filesystem_info(&self, mount_point: &str) -> Result<(u64, u64), CollectorError> {
|
||||
let output = Command::new("df")
|
||||
.arg("--block-size=1")
|
||||
.arg(path)
|
||||
.output()?;
|
||||
.args(&["--block-size=1", mount_point])
|
||||
.output()
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: format!("df {}", mount_point),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(anyhow::anyhow!("df command failed for {}", path));
|
||||
}
|
||||
|
||||
let output_str = String::from_utf8(output.stdout)?;
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
let lines: Vec<&str> = output_str.lines().collect();
|
||||
|
||||
|
||||
if lines.len() < 2 {
|
||||
return Err(anyhow::anyhow!("Unexpected df output format"));
|
||||
return Err(CollectorError::Parse {
|
||||
value: output_str.to_string(),
|
||||
error: "Expected at least 2 lines from df output".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
let fields: Vec<&str> = lines[1].split_whitespace().collect();
|
||||
if fields.len() < 4 {
|
||||
return Err(anyhow::anyhow!("Unexpected df fields count"));
|
||||
// Parse the data line (skip header)
|
||||
let parts: Vec<&str> = lines[1].split_whitespace().collect();
|
||||
if parts.len() < 4 {
|
||||
return Err(CollectorError::Parse {
|
||||
value: lines[1].to_string(),
|
||||
error: "Expected at least 4 fields in df output".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
let total_bytes = fields[1].parse::<u64>()?;
|
||||
let used_bytes = fields[2].parse::<u64>()?;
|
||||
let total_bytes: u64 = parts[1].parse().map_err(|e| CollectorError::Parse {
|
||||
value: parts[1].to_string(),
|
||||
error: format!("Failed to parse total bytes: {}", e),
|
||||
})?;
|
||||
|
||||
let used_bytes: u64 = parts[2].parse().map_err(|e| CollectorError::Parse {
|
||||
value: parts[2].to_string(),
|
||||
error: format!("Failed to parse used bytes: {}", e),
|
||||
})?;
|
||||
|
||||
Ok((total_bytes, used_bytes))
|
||||
}
|
||||
|
||||
/// Get root filesystem disk usage
|
||||
fn get_root_filesystem_usage(&self) -> Result<(u64, u64, f32)> {
|
||||
let (total_bytes, used_bytes) = self.get_filesystem_info("/")?;
|
||||
let usage_percent = (used_bytes as f64 / total_bytes as f64) * 100.0;
|
||||
Ok((total_bytes, used_bytes, usage_percent as f32))
|
||||
/// Detect MergerFS pools from mount data
|
||||
fn detect_mergerfs_pools(&self, filesystem_usage: &HashMap<String, (u64, u64)>) -> anyhow::Result<Vec<MergerfsPool>> {
|
||||
let mounts_content = std::fs::read_to_string("/proc/mounts")
|
||||
.map_err(|e| anyhow::anyhow!("Failed to read /proc/mounts: {}", e))?;
|
||||
let mut pools = Vec::new();
|
||||
|
||||
for line in mounts_content.lines() {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 3 && parts[2] == "fuse.mergerfs" {
|
||||
let mount_point = parts[1].to_string();
|
||||
let device_sources = parts[0]; // e.g., "/mnt/disk1:/mnt/disk2"
|
||||
|
||||
// Get pool usage
|
||||
let (total_bytes, used_bytes) = filesystem_usage.get(&mount_point)
|
||||
.copied()
|
||||
.unwrap_or((0, 0));
|
||||
|
||||
// Extract pool name from mount point (e.g., "/srv/media" -> "srv_media")
|
||||
let pool_name = if mount_point == "/" {
|
||||
"root".to_string()
|
||||
} else {
|
||||
mount_point.trim_start_matches('/').replace('/', "_")
|
||||
};
|
||||
|
||||
if pool_name.is_empty() {
|
||||
debug!("Skipping mergerfs pool with empty name: {}", mount_point);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Parse member paths - handle both full paths and numeric references
|
||||
let raw_paths: Vec<String> = device_sources
|
||||
.split(':')
|
||||
.map(|s| s.trim().to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect();
|
||||
|
||||
// Convert numeric references to actual mount points if needed
|
||||
let member_paths = if raw_paths.iter().any(|path| !path.starts_with('/')) {
|
||||
// Handle numeric format like "1:2" by finding corresponding /mnt/disk* paths
|
||||
self.resolve_numeric_mergerfs_paths(&raw_paths)?
|
||||
} else {
|
||||
// Already full paths
|
||||
raw_paths
|
||||
};
|
||||
|
||||
// For SnapRAID setups, include parity drives that are related to this pool's data drives
|
||||
let mut all_member_paths = member_paths.clone();
|
||||
let related_parity_paths = self.discover_related_parity_drives(&member_paths)?;
|
||||
all_member_paths.extend(related_parity_paths);
|
||||
|
||||
// Categorize as data vs parity drives
|
||||
let (data_drives, parity_drives) = match self.categorize_pool_drives(&all_member_paths) {
|
||||
Ok(drives) => drives,
|
||||
Err(e) => {
|
||||
debug!("Failed to categorize drives for pool {}: {}. Skipping.", mount_point, e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
pools.push(MergerfsPool {
|
||||
name: pool_name,
|
||||
mount_point,
|
||||
total_bytes,
|
||||
used_bytes,
|
||||
data_drives,
|
||||
parity_drives,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Found {} mergerfs pools", pools.len());
|
||||
Ok(pools)
|
||||
}
|
||||
|
||||
/// Group filesystems by physical drive (excluding mergerfs members) - exact old logic
|
||||
fn group_by_physical_drive(
|
||||
&self,
|
||||
mount_devices: &HashMap<String, String>,
|
||||
filesystem_usage: &HashMap<String, (u64, u64)>,
|
||||
mergerfs_pools: &[MergerfsPool]
|
||||
) -> anyhow::Result<Vec<PhysicalDrive>> {
|
||||
let mut drive_groups: HashMap<String, Vec<Filesystem>> = HashMap::new();
|
||||
|
||||
// Get all mergerfs member paths to exclude them - exactly like old code
|
||||
let mut mergerfs_members = std::collections::HashSet::new();
|
||||
for pool in mergerfs_pools {
|
||||
for drive in &pool.data_drives {
|
||||
mergerfs_members.insert(drive.mount_point.clone());
|
||||
}
|
||||
for drive in &pool.parity_drives {
|
||||
mergerfs_members.insert(drive.mount_point.clone());
|
||||
}
|
||||
}
|
||||
|
||||
// Group filesystems by base device
|
||||
for (mount_point, device) in mount_devices {
|
||||
// Skip mergerfs member mounts
|
||||
if mergerfs_members.contains(mount_point) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let base_device = self.extract_base_device(device);
|
||||
|
||||
if let Some((total, used)) = filesystem_usage.get(mount_point) {
|
||||
let usage_percent = (*used as f32 / *total as f32) * 100.0;
|
||||
|
||||
let filesystem = Filesystem {
|
||||
mount_point: mount_point.clone(), // Keep actual mount point like "/" and "/boot"
|
||||
usage_percent,
|
||||
used_bytes: *used,
|
||||
total_bytes: *total,
|
||||
};
|
||||
|
||||
drive_groups.entry(base_device).or_insert_with(Vec::new).push(filesystem);
|
||||
}
|
||||
}
|
||||
|
||||
// Convert to PhysicalDrive structs
|
||||
let mut physical_drives = Vec::new();
|
||||
for (drive_name, filesystems) in drive_groups {
|
||||
let physical_drive = PhysicalDrive {
|
||||
name: drive_name,
|
||||
health: "UNKNOWN".to_string(), // Will be updated with SMART data
|
||||
filesystems,
|
||||
};
|
||||
physical_drives.push(physical_drive);
|
||||
}
|
||||
|
||||
physical_drives.sort_by(|a, b| a.name.cmp(&b.name));
|
||||
Ok(physical_drives)
|
||||
}
|
||||
|
||||
/// Get the physical device for a given device (resolves symlinks, gets parent device)
|
||||
fn get_physical_device(&self, device: &str) -> Result<String> {
|
||||
// For NVMe: /dev/nvme0n1p1 -> /dev/nvme0n1
|
||||
if device.contains("nvme") && device.contains("p") {
|
||||
if let Some(base) = device.split('p').next() {
|
||||
return Ok(base.to_string());
|
||||
/// Extract base device name from device path
|
||||
fn extract_base_device(&self, device: &str) -> String {
|
||||
// Extract base device name (e.g., "/dev/nvme0n1p1" -> "nvme0n1")
|
||||
if let Some(dev_name) = device.strip_prefix("/dev/") {
|
||||
// Remove partition numbers: nvme0n1p1 -> nvme0n1, sda1 -> sda
|
||||
if let Some(pos) = dev_name.find('p') {
|
||||
if dev_name[pos+1..].chars().all(char::is_numeric) {
|
||||
return dev_name[..pos].to_string();
|
||||
}
|
||||
}
|
||||
// Handle traditional naming: sda1 -> sda
|
||||
let mut result = String::new();
|
||||
for ch in dev_name.chars() {
|
||||
if ch.is_ascii_digit() {
|
||||
break;
|
||||
}
|
||||
result.push(ch);
|
||||
}
|
||||
if !result.is_empty() {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
device.to_string()
|
||||
}
|
||||
|
||||
/// Get SMART data for drives
|
||||
async fn get_smart_data_for_drives(&self, physical_drives: &[PhysicalDrive], mergerfs_pools: &[MergerfsPool]) -> HashMap<String, SmartData> {
|
||||
let mut smart_data = HashMap::new();
|
||||
|
||||
// Collect all drive names
|
||||
let mut all_drives = std::collections::HashSet::new();
|
||||
for drive in physical_drives {
|
||||
all_drives.insert(drive.name.clone());
|
||||
}
|
||||
for pool in mergerfs_pools {
|
||||
for drive in &pool.data_drives {
|
||||
all_drives.insert(drive.name.clone());
|
||||
}
|
||||
for drive in &pool.parity_drives {
|
||||
all_drives.insert(drive.name.clone());
|
||||
}
|
||||
}
|
||||
|
||||
// For SATA: /dev/sda1 -> /dev/sda
|
||||
if device.starts_with("/dev/sd") && device.len() > 8 {
|
||||
return Ok(device[..8].to_string()); // Keep /dev/sdX
|
||||
// Get SMART data for each drive
|
||||
for drive_name in all_drives {
|
||||
if let Ok(data) = self.get_smart_data(&drive_name).await {
|
||||
smart_data.insert(drive_name, data);
|
||||
}
|
||||
}
|
||||
|
||||
// For VirtIO: /dev/vda1 -> /dev/vda
|
||||
if device.starts_with("/dev/vd") && device.len() > 8 {
|
||||
return Ok(device[..8].to_string());
|
||||
}
|
||||
|
||||
// If no partition detected, return as-is
|
||||
Ok(device.to_string())
|
||||
smart_data
|
||||
}
|
||||
|
||||
/// Get SMART health for a specific physical device
|
||||
fn get_smart_health(&self, device: &str) -> (String, f32) {
|
||||
if let Ok(output) = Command::new("sudo")
|
||||
.arg("smartctl")
|
||||
.arg("-H")
|
||||
.arg(device)
|
||||
.output()
|
||||
{
|
||||
if output.status.success() {
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
let health_status = if output_str.contains("PASSED") {
|
||||
"PASSED"
|
||||
} else if output_str.contains("FAILED") {
|
||||
"FAILED"
|
||||
} else {
|
||||
"UNKNOWN"
|
||||
};
|
||||
/// Get SMART data for a single drive
|
||||
async fn get_smart_data(&self, drive_name: &str) -> Result<SmartData, CollectorError> {
|
||||
// Use direct smartctl (no sudo) - service has CAP_SYS_RAWIO and CAP_SYS_ADMIN capabilities
|
||||
// For NVMe drives, specify device type explicitly
|
||||
let mut cmd = Command::new("smartctl");
|
||||
if drive_name.starts_with("nvme") {
|
||||
cmd.args(&["-d", "nvme", "-a", &format!("/dev/{}", drive_name)]);
|
||||
} else {
|
||||
cmd.args(&["-a", &format!("/dev/{}", drive_name)]);
|
||||
}
|
||||
|
||||
// Try to get temperature
|
||||
let temperature = if let Ok(temp_output) = Command::new("sudo")
|
||||
.arg("smartctl")
|
||||
.arg("-A")
|
||||
.arg(device)
|
||||
.output()
|
||||
{
|
||||
let temp_str = String::from_utf8_lossy(&temp_output.stdout);
|
||||
// Look for temperature in SMART attributes
|
||||
for line in temp_str.lines() {
|
||||
if line.contains("Temperature") && line.contains("Celsius") {
|
||||
if let Some(temp_part) = line.split_whitespace().nth(9) {
|
||||
if let Ok(temp) = temp_part.parse::<f32>() {
|
||||
return (health_status.to_string(), temp);
|
||||
}
|
||||
let output = cmd.output()
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: format!("SMART data for {}", drive_name),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
|
||||
if !output.status.success() {
|
||||
// Return unknown data rather than failing completely
|
||||
return Ok(SmartData {
|
||||
health: "UNKNOWN".to_string(),
|
||||
serial_number: None,
|
||||
temperature_celsius: None,
|
||||
wear_percent: None,
|
||||
});
|
||||
}
|
||||
|
||||
let mut health = "UNKNOWN".to_string();
|
||||
let mut serial_number = None;
|
||||
let mut temperature = None;
|
||||
let mut wear_percent = None;
|
||||
|
||||
for line in output_str.lines() {
|
||||
if line.contains("SMART overall-health") {
|
||||
if line.contains("PASSED") {
|
||||
health = "PASSED".to_string();
|
||||
} else if line.contains("FAILED") {
|
||||
health = "FAILED".to_string();
|
||||
}
|
||||
}
|
||||
|
||||
// Serial number parsing (both SATA and NVMe)
|
||||
if line.contains("Serial Number:") {
|
||||
if let Some(serial_part) = line.split("Serial Number:").nth(1) {
|
||||
let serial_str = serial_part.trim();
|
||||
if !serial_str.is_empty() {
|
||||
// Take first whitespace-separated token
|
||||
if let Some(serial) = serial_str.split_whitespace().next() {
|
||||
serial_number = Some(serial.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Temperature parsing for different drive types
|
||||
if line.contains("Temperature_Celsius") || line.contains("Airflow_Temperature_Cel") || line.contains("Temperature_Case") {
|
||||
// Traditional SATA drives: attribute table format
|
||||
if let Some(temp_str) = line.split_whitespace().nth(9) {
|
||||
if let Ok(temp) = temp_str.parse::<f32>() {
|
||||
temperature = Some(temp);
|
||||
}
|
||||
}
|
||||
} else if line.starts_with("Temperature:") {
|
||||
// NVMe drives: simple "Temperature: 27 Celsius" format
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 2 {
|
||||
if let Ok(temp) = parts[1].parse::<f32>() {
|
||||
temperature = Some(temp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Wear level parsing for SSDs
|
||||
if line.contains("Media_Wearout_Indicator") {
|
||||
// Media_Wearout_Indicator stores remaining life % in column 3 (VALUE)
|
||||
if let Some(wear_str) = line.split_whitespace().nth(3) {
|
||||
if let Ok(remaining) = wear_str.parse::<f32>() {
|
||||
wear_percent = Some(100.0 - remaining); // Convert remaining life to wear
|
||||
}
|
||||
}
|
||||
} else if line.contains("Wear_Leveling_Count") || line.contains("SSD_Life_Left") {
|
||||
// Other wear attributes store value in column 9 (RAW_VALUE)
|
||||
if let Some(wear_str) = line.split_whitespace().nth(9) {
|
||||
if let Ok(wear) = wear_str.parse::<f32>() {
|
||||
wear_percent = Some(100.0 - wear); // Convert remaining life to wear
|
||||
}
|
||||
}
|
||||
}
|
||||
// NVMe wear parsing: "Percentage Used: 1%"
|
||||
if line.contains("Percentage Used:") {
|
||||
if let Some(percent_part) = line.split("Percentage Used:").nth(1) {
|
||||
if let Some(percent_str) = percent_part.split_whitespace().next() {
|
||||
if let Some(percent_clean) = percent_str.strip_suffix('%') {
|
||||
if let Ok(wear) = percent_clean.parse::<f32>() {
|
||||
wear_percent = Some(wear);
|
||||
}
|
||||
}
|
||||
}
|
||||
0.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
return (health_status.to_string(), temperature);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
("UNKNOWN".to_string(), 0.0)
|
||||
Ok(SmartData {
|
||||
health,
|
||||
serial_number,
|
||||
temperature_celsius: temperature,
|
||||
wear_percent,
|
||||
})
|
||||
}
|
||||
|
||||
/// Calculate status based on usage percentage
|
||||
fn calculate_usage_status(&self, used_bytes: u64, total_bytes: u64) -> Status {
|
||||
if total_bytes == 0 {
|
||||
return Status::Unknown;
|
||||
/// Populate drives data into AgentData
|
||||
fn populate_drives_data(&self, physical_drives: &[PhysicalDrive], smart_data: &HashMap<String, SmartData>, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
for drive in physical_drives {
|
||||
let smart = smart_data.get(&drive.name);
|
||||
|
||||
let mut filesystems: Vec<FilesystemData> = drive.filesystems.iter().map(|fs| {
|
||||
FilesystemData {
|
||||
mount: fs.mount_point.clone(), // This preserves "/" and "/boot" correctly
|
||||
usage_percent: fs.usage_percent,
|
||||
used_gb: fs.used_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
|
||||
total_gb: fs.total_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
|
||||
usage_status: self.calculate_filesystem_usage_status(fs.usage_percent),
|
||||
}
|
||||
}).collect();
|
||||
|
||||
// Sort filesystems by mount point for consistent display order
|
||||
filesystems.sort_by(|a, b| a.mount.cmp(&b.mount));
|
||||
|
||||
agent_data.system.storage.drives.push(DriveData {
|
||||
name: drive.name.clone(),
|
||||
serial_number: smart.and_then(|s| s.serial_number.clone()),
|
||||
health: smart.map(|s| s.health.clone()).unwrap_or_else(|| drive.health.clone()),
|
||||
temperature_celsius: smart.and_then(|s| s.temperature_celsius),
|
||||
wear_percent: smart.and_then(|s| s.wear_percent),
|
||||
filesystems,
|
||||
temperature_status: smart.and_then(|s| s.temperature_celsius)
|
||||
.map(|temp| self.calculate_temperature_status(temp))
|
||||
.unwrap_or(Status::Unknown),
|
||||
health_status: self.calculate_health_status(
|
||||
smart.map(|s| s.health.as_str()).unwrap_or("UNKNOWN")
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
let usage_percent = (used_bytes as f64 / total_bytes as f64) * 100.0;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Thresholds for disk usage
|
||||
/// Populate pools data into AgentData
|
||||
fn populate_pools_data(&self, mergerfs_pools: &[MergerfsPool], smart_data: &HashMap<String, SmartData>, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
for pool in mergerfs_pools {
|
||||
// Calculate pool health and statuses based on member drive health
|
||||
let (pool_health, health_status, usage_status, data_drive_data, parity_drive_data) = self.calculate_pool_health(pool, smart_data);
|
||||
|
||||
let pool_data = PoolData {
|
||||
name: pool.name.clone(),
|
||||
mount: pool.mount_point.clone(),
|
||||
pool_type: format!("mergerfs ({}+{})", pool.data_drives.len(), pool.parity_drives.len()),
|
||||
health: pool_health,
|
||||
usage_percent: if pool.total_bytes > 0 {
|
||||
(pool.used_bytes as f32 / pool.total_bytes as f32) * 100.0
|
||||
} else { 0.0 },
|
||||
used_gb: pool.used_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
|
||||
total_gb: pool.total_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
|
||||
data_drives: data_drive_data,
|
||||
parity_drives: parity_drive_data,
|
||||
health_status,
|
||||
usage_status,
|
||||
};
|
||||
|
||||
agent_data.system.storage.pools.push(pool_data);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Calculate pool health based on member drive status
|
||||
fn calculate_pool_health(&self, pool: &MergerfsPool, smart_data: &HashMap<String, SmartData>) -> (String, cm_dashboard_shared::Status, cm_dashboard_shared::Status, Vec<cm_dashboard_shared::PoolDriveData>, Vec<cm_dashboard_shared::PoolDriveData>) {
|
||||
let mut failed_data = 0;
|
||||
let mut failed_parity = 0;
|
||||
|
||||
// Process data drives
|
||||
let data_drive_data: Vec<cm_dashboard_shared::PoolDriveData> = pool.data_drives.iter().map(|d| {
|
||||
let smart = smart_data.get(&d.name);
|
||||
let health = smart.map(|s| s.health.clone()).unwrap_or_else(|| "UNKNOWN".to_string());
|
||||
let temperature = smart.and_then(|s| s.temperature_celsius).or(d.temperature_celsius);
|
||||
|
||||
if health == "FAILED" {
|
||||
failed_data += 1;
|
||||
}
|
||||
|
||||
// Calculate drive statuses using config thresholds
|
||||
let health_status = self.calculate_health_status(&health);
|
||||
let temperature_status = temperature.map(|t| self.temperature_thresholds.evaluate(t)).unwrap_or(cm_dashboard_shared::Status::Unknown);
|
||||
|
||||
cm_dashboard_shared::PoolDriveData {
|
||||
name: d.name.clone(),
|
||||
serial_number: smart.and_then(|s| s.serial_number.clone()),
|
||||
temperature_celsius: temperature,
|
||||
health,
|
||||
wear_percent: smart.and_then(|s| s.wear_percent),
|
||||
health_status,
|
||||
temperature_status,
|
||||
}
|
||||
}).collect();
|
||||
|
||||
// Process parity drives
|
||||
let parity_drive_data: Vec<cm_dashboard_shared::PoolDriveData> = pool.parity_drives.iter().map(|d| {
|
||||
let smart = smart_data.get(&d.name);
|
||||
let health = smart.map(|s| s.health.clone()).unwrap_or_else(|| "UNKNOWN".to_string());
|
||||
let temperature = smart.and_then(|s| s.temperature_celsius).or(d.temperature_celsius);
|
||||
|
||||
if health == "FAILED" {
|
||||
failed_parity += 1;
|
||||
}
|
||||
|
||||
// Calculate drive statuses using config thresholds
|
||||
let health_status = self.calculate_health_status(&health);
|
||||
let temperature_status = temperature.map(|t| self.temperature_thresholds.evaluate(t)).unwrap_or(cm_dashboard_shared::Status::Unknown);
|
||||
|
||||
cm_dashboard_shared::PoolDriveData {
|
||||
name: d.name.clone(),
|
||||
serial_number: smart.and_then(|s| s.serial_number.clone()),
|
||||
temperature_celsius: temperature,
|
||||
health,
|
||||
wear_percent: smart.and_then(|s| s.wear_percent),
|
||||
health_status,
|
||||
temperature_status,
|
||||
}
|
||||
}).collect();
|
||||
|
||||
// Calculate overall pool health string and status
|
||||
// SnapRAID logic: can tolerate up to N parity drive failures (where N = number of parity drives)
|
||||
// If data drives fail AND we've lost parity protection, that's critical
|
||||
let (pool_health, health_status) = if failed_data == 0 && failed_parity == 0 {
|
||||
("healthy".to_string(), cm_dashboard_shared::Status::Ok)
|
||||
} else if failed_data == 0 && failed_parity > 0 {
|
||||
// Parity failed but no data loss - degraded (reduced protection)
|
||||
("degraded".to_string(), cm_dashboard_shared::Status::Warning)
|
||||
} else if failed_data == 1 && failed_parity == 0 {
|
||||
// One data drive failed, parity intact - degraded (recoverable)
|
||||
("degraded".to_string(), cm_dashboard_shared::Status::Warning)
|
||||
} else {
|
||||
// Multiple data drives failed OR data+parity failed = data loss risk
|
||||
("critical".to_string(), cm_dashboard_shared::Status::Critical)
|
||||
};
|
||||
|
||||
// Calculate pool usage status using config thresholds
|
||||
let usage_percent = if pool.total_bytes > 0 {
|
||||
(pool.used_bytes as f32 / pool.total_bytes as f32) * 100.0
|
||||
} else { 0.0 };
|
||||
|
||||
let usage_status = if usage_percent >= self.config.usage_critical_percent {
|
||||
cm_dashboard_shared::Status::Critical
|
||||
} else if usage_percent >= self.config.usage_warning_percent {
|
||||
cm_dashboard_shared::Status::Warning
|
||||
} else {
|
||||
cm_dashboard_shared::Status::Ok
|
||||
};
|
||||
|
||||
(pool_health, health_status, usage_status, data_drive_data, parity_drive_data)
|
||||
}
|
||||
|
||||
/// Calculate filesystem usage status
|
||||
fn calculate_filesystem_usage_status(&self, usage_percent: f32) -> Status {
|
||||
// Use standard filesystem warning/critical thresholds
|
||||
if usage_percent >= 95.0 {
|
||||
Status::Critical
|
||||
} else if usage_percent >= 85.0 {
|
||||
@@ -300,312 +687,161 @@ impl DiskCollector {
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse size string (e.g., "120G", "45M") to GB value
|
||||
fn parse_size_to_gb(&self, size_str: &str) -> f32 {
|
||||
let size_str = size_str.trim();
|
||||
if size_str.is_empty() || size_str == "-" {
|
||||
return 0.0;
|
||||
}
|
||||
/// Calculate drive temperature status
|
||||
fn calculate_temperature_status(&self, temperature: f32) -> Status {
|
||||
self.temperature_thresholds.evaluate(temperature)
|
||||
}
|
||||
|
||||
// Extract numeric part and unit
|
||||
let (num_str, unit) = if let Some(last_char) = size_str.chars().last() {
|
||||
if last_char.is_alphabetic() {
|
||||
let num_part = &size_str[..size_str.len() - 1];
|
||||
let unit_part = &size_str[size_str.len() - 1..];
|
||||
(num_part, unit_part)
|
||||
} else {
|
||||
(size_str, "")
|
||||
/// Calculate drive health status
|
||||
fn calculate_health_status(&self, health: &str) -> Status {
|
||||
match health {
|
||||
"PASSED" => Status::Ok,
|
||||
"FAILED" => Status::Critical,
|
||||
_ => Status::Unknown,
|
||||
}
|
||||
}
|
||||
|
||||
/// Discover parity drives that are related to the given data drives
|
||||
fn discover_related_parity_drives(&self, data_drives: &[String]) -> anyhow::Result<Vec<String>> {
|
||||
let mount_devices = tokio::task::block_in_place(|| {
|
||||
tokio::runtime::Handle::current().block_on(self.get_mount_devices())
|
||||
}).map_err(|e| anyhow::anyhow!("Failed to get mount devices: {}", e))?;
|
||||
|
||||
let mut related_parity = Vec::new();
|
||||
|
||||
// Find parity drives that share the same parent directory as the data drives
|
||||
for data_path in data_drives {
|
||||
if let Some(parent_dir) = self.get_parent_directory(data_path) {
|
||||
// Look for parity drives in the same parent directory
|
||||
for (mount_point, _device) in &mount_devices {
|
||||
if mount_point.contains("parity") && mount_point.starts_with(&parent_dir) {
|
||||
if !related_parity.contains(mount_point) {
|
||||
related_parity.push(mount_point.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
(size_str, "")
|
||||
};
|
||||
|
||||
let number: f32 = num_str.parse().unwrap_or(0.0);
|
||||
|
||||
match unit.to_uppercase().as_str() {
|
||||
"T" | "TB" => number * 1024.0,
|
||||
"G" | "GB" => number,
|
||||
"M" | "MB" => number / 1024.0,
|
||||
"K" | "KB" => number / (1024.0 * 1024.0),
|
||||
"B" | "" => number / (1024.0 * 1024.0 * 1024.0),
|
||||
_ => number, // Assume GB if unknown unit
|
||||
}
|
||||
|
||||
Ok(related_parity)
|
||||
}
|
||||
|
||||
/// Get parent directory of a mount path (e.g., "/mnt/disk1" -> "/mnt")
|
||||
fn get_parent_directory(&self, path: &str) -> Option<String> {
|
||||
if let Some(last_slash) = path.rfind('/') {
|
||||
if last_slash > 0 {
|
||||
return Some(path[..last_slash].to_string());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Categorize pool member drives as data vs parity
|
||||
fn categorize_pool_drives(&self, member_paths: &[String]) -> anyhow::Result<(Vec<PoolDrive>, Vec<PoolDrive>)> {
|
||||
let mut data_drives = Vec::new();
|
||||
let mut parity_drives = Vec::new();
|
||||
|
||||
for path in member_paths {
|
||||
let drive_info = self.get_drive_info_for_path(path)?;
|
||||
|
||||
// Heuristic: if path contains "parity", it's parity
|
||||
if path.to_lowercase().contains("parity") {
|
||||
parity_drives.push(drive_info);
|
||||
} else {
|
||||
data_drives.push(drive_info);
|
||||
}
|
||||
}
|
||||
|
||||
Ok((data_drives, parity_drives))
|
||||
}
|
||||
|
||||
/// Get drive information for a mount path
|
||||
fn get_drive_info_for_path(&self, path: &str) -> anyhow::Result<PoolDrive> {
|
||||
// Use lsblk to find the backing device
|
||||
let output = Command::new("lsblk")
|
||||
.args(&["-rn", "-o", "NAME,MOUNTPOINT"])
|
||||
.output()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to run lsblk: {}", e))?;
|
||||
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
let mut device = String::new();
|
||||
|
||||
for line in output_str.lines() {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 2 && parts[1] == path {
|
||||
device = parts[0].to_string();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if device.is_empty() {
|
||||
return Err(anyhow::anyhow!("Could not find device for path {}", path));
|
||||
}
|
||||
|
||||
// Extract base device name (e.g., "sda1" -> "sda")
|
||||
let base_device = self.extract_base_device(&format!("/dev/{}", device));
|
||||
|
||||
// Get temperature from SMART data if available
|
||||
let temperature = if let Ok(smart_data) = tokio::task::block_in_place(|| {
|
||||
tokio::runtime::Handle::current().block_on(self.get_smart_data(&base_device))
|
||||
}) {
|
||||
smart_data.temperature_celsius
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(PoolDrive {
|
||||
name: base_device,
|
||||
mount_point: path.to_string(),
|
||||
temperature_celsius: temperature,
|
||||
})
|
||||
}
|
||||
|
||||
/// Resolve numeric mergerfs references like "1:2" to actual mount paths
|
||||
fn resolve_numeric_mergerfs_paths(&self, numeric_refs: &[String]) -> anyhow::Result<Vec<String>> {
|
||||
let mut resolved_paths = Vec::new();
|
||||
|
||||
// Get all mount points that look like /mnt/disk* or /mnt/parity*
|
||||
let mount_devices = tokio::task::block_in_place(|| {
|
||||
tokio::runtime::Handle::current().block_on(self.get_mount_devices())
|
||||
}).map_err(|e| anyhow::anyhow!("Failed to get mount devices: {}", e))?;
|
||||
|
||||
let mut disk_mounts: Vec<String> = mount_devices.keys()
|
||||
.filter(|path| path.starts_with("/mnt/disk") || path.starts_with("/mnt/parity"))
|
||||
.cloned()
|
||||
.collect();
|
||||
disk_mounts.sort(); // Ensure consistent ordering
|
||||
|
||||
for num_ref in numeric_refs {
|
||||
if let Ok(index) = num_ref.parse::<usize>() {
|
||||
// Convert 1-based index to 0-based
|
||||
if index > 0 && index <= disk_mounts.len() {
|
||||
resolved_paths.push(disk_mounts[index - 1].clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: if we couldn't resolve, return the original paths
|
||||
if resolved_paths.is_empty() {
|
||||
resolved_paths = numeric_refs.to_vec();
|
||||
}
|
||||
|
||||
Ok(resolved_paths)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for DiskCollector {
|
||||
fn name(&self) -> &str {
    // Stable collector identifier used for registration and logging.
    "disk"
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
let start_time = Instant::now();
|
||||
debug!("Collecting multi-disk metrics");
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
|
||||
// Use UUID-based configured filesystems
|
||||
let mounted_disks = match self.get_configured_filesystems() {
|
||||
Ok(configured) => {
|
||||
debug!("Using UUID-based filesystems: {} found", configured.len());
|
||||
configured
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to get configured filesystems: {}", e);
|
||||
Vec::new()
|
||||
}
|
||||
};
|
||||
|
||||
// Process discovered/configured disks
|
||||
if !mounted_disks.is_empty() {
|
||||
debug!("Found {} mounted disks", mounted_disks.len());
|
||||
|
||||
// Group disks by physical device to avoid duplicate SMART checks
|
||||
let mut physical_devices: std::collections::HashMap<String, Vec<&MountedDisk>> =
|
||||
std::collections::HashMap::new();
|
||||
for disk in &mounted_disks {
|
||||
physical_devices
|
||||
.entry(disk.physical_device.clone())
|
||||
.or_insert_with(Vec::new)
|
||||
.push(disk);
|
||||
}
|
||||
|
||||
// Generate metrics for each mounted disk
|
||||
for (disk_index, disk) in mounted_disks.iter().enumerate() {
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
// Always use index for metric names to maintain dashboard compatibility
|
||||
let disk_name = disk_index.to_string();
|
||||
|
||||
// Parse size strings to get actual values for calculations
|
||||
let size_gb = self.parse_size_to_gb(&disk.size);
|
||||
let used_gb = self.parse_size_to_gb(&disk.used);
|
||||
let avail_gb = self.parse_size_to_gb(&disk.available);
|
||||
|
||||
// Calculate status based on configured thresholds
|
||||
let status = if disk.usage_percent >= self.config.usage_critical_percent {
|
||||
Status::Critical
|
||||
} else if disk.usage_percent >= self.config.usage_warning_percent {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
|
||||
// Device and mount point info
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_device", disk_name),
|
||||
value: MetricValue::String(disk.device.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("Device: {}", disk.device)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_mount_point", disk_name),
|
||||
value: MetricValue::String(disk.mount_point.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("Mount: {}", disk.mount_point)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_filesystem", disk_name),
|
||||
value: MetricValue::String(disk.filesystem.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("FS: {}", disk.filesystem)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Size metrics
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_total_gb", disk_name),
|
||||
value: MetricValue::Float(size_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Total: {}", disk.size)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_used_gb", disk_name),
|
||||
value: MetricValue::Float(used_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Used: {}", disk.used)),
|
||||
status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_available_gb", disk_name),
|
||||
value: MetricValue::Float(avail_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Available: {}", disk.available)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_usage_percent", disk_name),
|
||||
value: MetricValue::Float(disk.usage_percent),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some(format!("Usage: {:.1}%", disk.usage_percent)),
|
||||
status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Physical device name (for SMART health grouping)
|
||||
let physical_device_name = disk
|
||||
.physical_device
|
||||
.strip_prefix("/dev/")
|
||||
.unwrap_or(&disk.physical_device);
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_physical_device", disk_name),
|
||||
value: MetricValue::String(physical_device_name.to_string()),
|
||||
unit: None,
|
||||
description: Some(format!("Physical: {}", physical_device_name)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
|
||||
// Add SMART health metrics for each unique physical device
|
||||
for (physical_device, _disks) in physical_devices {
|
||||
let (health_status, temperature) = self.get_smart_health(&physical_device);
|
||||
let device_name = physical_device
|
||||
.strip_prefix("/dev/")
|
||||
.unwrap_or(&physical_device);
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
let health_status_enum = match health_status.as_str() {
|
||||
"PASSED" => Status::Ok,
|
||||
"FAILED" => Status::Critical,
|
||||
_ => Status::Unknown,
|
||||
};
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_smart_{}_health", device_name),
|
||||
value: MetricValue::String(health_status.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("SMART Health: {}", health_status)),
|
||||
status: health_status_enum,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
if temperature > 0.0 {
|
||||
let temp_status = if temperature >= 70.0 {
|
||||
Status::Critical
|
||||
} else if temperature >= 60.0 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_smart_{}_temperature", device_name),
|
||||
value: MetricValue::Float(temperature),
|
||||
unit: Some("°C".to_string()),
|
||||
description: Some(format!("Temperature: {:.0}°C", temperature)),
|
||||
status: temp_status,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Add disk count metric
|
||||
metrics.push(Metric {
|
||||
name: "disk_count".to_string(),
|
||||
value: MetricValue::Integer(mounted_disks.len() as i64),
|
||||
unit: None,
|
||||
description: Some(format!("Total mounted disks: {}", mounted_disks.len())),
|
||||
status: Status::Ok,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
} else {
|
||||
// No disks configured - add zero count metric
|
||||
metrics.push(Metric {
|
||||
name: "disk_count".to_string(),
|
||||
value: MetricValue::Integer(0),
|
||||
unit: None,
|
||||
description: Some("No disks configured for monitoring".to_string()),
|
||||
status: Status::Warning,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
}
|
||||
|
||||
// Monitor /tmp directory size (keep existing functionality)
|
||||
match self.get_directory_size("/tmp") {
|
||||
Ok(tmp_size_bytes) => {
|
||||
let tmp_size_mb = tmp_size_bytes as f64 / (1024.0 * 1024.0);
|
||||
|
||||
// Get /tmp filesystem info (usually tmpfs with 2GB limit)
|
||||
let (total_bytes, _) = match self.get_filesystem_info("/tmp") {
|
||||
Ok((total, used)) => (total, used),
|
||||
Err(_) => {
|
||||
// Fallback: assume 2GB limit for tmpfs
|
||||
(2 * 1024 * 1024 * 1024, tmp_size_bytes)
|
||||
}
|
||||
};
|
||||
|
||||
let total_mb = total_bytes as f64 / (1024.0 * 1024.0);
|
||||
let usage_percent = (tmp_size_bytes as f64 / total_bytes as f64) * 100.0;
|
||||
let status = self.calculate_usage_status(tmp_size_bytes, total_bytes);
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "disk_tmp_size_mb".to_string(),
|
||||
value: MetricValue::Float(tmp_size_mb as f32),
|
||||
unit: Some("MB".to_string()),
|
||||
description: Some(format!("Used: {:.1} MB", tmp_size_mb)),
|
||||
status,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "disk_tmp_total_mb".to_string(),
|
||||
value: MetricValue::Float(total_mb as f32),
|
||||
unit: Some("MB".to_string()),
|
||||
description: Some(format!("Total: {:.1} MB", total_mb)),
|
||||
status: Status::Ok,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "disk_tmp_usage_percent".to_string(),
|
||||
value: MetricValue::Float(usage_percent as f32),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some(format!("Usage: {:.1}%", usage_percent)),
|
||||
status,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to get /tmp size: {}", e);
|
||||
metrics.push(Metric {
|
||||
name: "disk_tmp_size_mb".to_string(),
|
||||
value: MetricValue::String("error".to_string()),
|
||||
unit: Some("MB".to_string()),
|
||||
description: Some(format!("Error: {}", e)),
|
||||
status: Status::Unknown,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let collection_time = start_time.elapsed();
|
||||
debug!(
|
||||
"Multi-disk collection completed in {:?} with {} metrics",
|
||||
collection_time,
|
||||
metrics.len()
|
||||
);
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
|
||||
None // Performance tracking handled by cache system
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
self.collect_storage_data(agent_data).await
|
||||
}
|
||||
}
|
||||
|
||||
/// SMART data for a drive
#[derive(Debug, Clone)]
struct SmartData {
    /// Overall SMART self-assessment string, e.g. "PASSED" / "FAILED" / "UNKNOWN".
    health: String,
    /// Drive serial number, when reported.
    serial_number: Option<String>,
    /// Current drive temperature in °C, when reported.
    temperature_celsius: Option<f32>,
    /// SSD wear percentage, when reported.
    wear_percent: Option<f32>,
}
|
||||
1327
agent/src/collectors/disk_old.rs
Normal file
1327
agent/src/collectors/disk_old.rs
Normal file
@@ -0,0 +1,1327 @@
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||
|
||||
use crate::config::DiskConfig;
|
||||
use std::process::Command;
|
||||
use std::time::Instant;
|
||||
use std::fs;
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError};
|
||||
|
||||
/// Mount point information from /proc/mounts
/// (first three whitespace-separated fields of each line).
#[derive(Debug, Clone)]
struct MountInfo {
    device: String, // e.g., "/dev/sda1" or "/mnt/disk1:/mnt/disk2"
    mount_point: String, // e.g., "/", "/srv/media"
    fs_type: String, // e.g., "ext4", "xfs", "fuse.mergerfs"
}
|
||||
|
||||
/// Auto-discovered storage topology
#[derive(Debug, Clone)]
struct StorageTopology {
    /// Plain filesystems that are not members of any mergerfs pool.
    single_disks: Vec<MountInfo>,
    /// Discovered mergerfs pools with their data/parity member mounts.
    mergerfs_pools: Vec<MergerfsPoolInfo>,
}
|
||||
|
||||
/// MergerFS pool information
/// (members come from the colon-separated mergerfs source string;
/// parity disks are detected heuristically).
#[derive(Debug, Clone)]
struct MergerfsPoolInfo {
    mount_point: String, // e.g., "/srv/media"
    data_members: Vec<String>, // e.g., ["/mnt/disk1", "/mnt/disk2"]
    parity_disks: Vec<String>, // e.g., ["/mnt/parity"]
}
|
||||
|
||||
/// Information about a storage pool (mount point with underlying drives).
/// Size fields are kept as human-readable strings as reported by the source tool.
#[derive(Debug, Clone)]
struct StoragePool {
    name: String, // e.g., "steampool", "root"
    mount_point: String, // e.g., "/mnt/steampool", "/"
    filesystem: String, // e.g., "mergerfs", "ext4", "zfs", "btrfs"
    pool_type: StoragePoolType, // Enhanced pool type with configuration
    size: String, // e.g., "2.5TB"
    used: String, // e.g., "2.1TB"
    available: String, // e.g., "400GB"
    usage_percent: f32, // e.g., 85.0
    underlying_drives: Vec<DriveInfo>, // Individual physical drives
    pool_health: PoolHealth, // Overall pool health status
}
|
||||
|
||||
/// Enhanced storage pool types with specific configurations.
/// Variants marked #[allow(dead_code)] are reserved for future integrations.
#[derive(Debug, Clone)]
enum StoragePoolType {
    Single, // Traditional single disk (legacy)
    PhysicalDrive { // Physical drive with multiple filesystems
        filesystems: Vec<String>, // Mount points on this drive
    },
    MergerfsPool { // MergerFS with optional parity
        data_disks: Vec<String>, // Member disk names (sdb, sdd)
        parity_disks: Vec<String>, // Parity disk names (sdc)
    },
    #[allow(dead_code)]
    RaidArray { // Hardware RAID (future)
        level: String, // "RAID1", "RAID5", etc.
        member_disks: Vec<String>,
        spare_disks: Vec<String>,
    },
    #[allow(dead_code)]
    ZfsPool { // ZFS pool (future)
        pool_name: String,
        vdevs: Vec<String>,
    }
}
|
||||
|
||||
/// Pool health status for redundant storage.
#[derive(Debug, Clone, Copy, PartialEq)]
enum PoolHealth {
    Healthy, // All drives OK, parity current
    Degraded, // One drive failed or parity outdated, still functional
    Critical, // Multiple failures, data at risk
    #[allow(dead_code)]
    Rebuilding, // Actively rebuilding/scrubbing (future: SnapRAID status integration)
    Unknown, // Cannot determine status
}
|
||||
|
||||
/// Information about an individual physical drive (SMART-derived where available).
#[derive(Debug, Clone)]
struct DriveInfo {
    device: String, // e.g., "sda", "nvme0n1"
    health_status: String, // e.g., "PASSED", "FAILED"
    temperature: Option<f32>, // e.g., 45.0°C
    wear_level: Option<f32>, // e.g., 12.0% (for SSDs)
}
|
||||
|
||||
/// Disk usage collector for monitoring filesystem sizes.
/// Topology and device detection happen once, in `new()`.
pub struct DiskCollector {
    config: DiskConfig,
    // Hysteresis thresholds built from config temperature warning/critical values.
    temperature_thresholds: HysteresisThresholds,
    detected_devices: std::collections::HashMap<String, Vec<String>>, // mount_point -> devices
    storage_topology: Option<StorageTopology>, // Auto-discovered storage layout
}
|
||||
|
||||
impl DiskCollector {
|
||||
/// Build a DiskCollector: derive temperature thresholds from config,
/// auto-discover the storage topology, and pre-resolve the backing devices
/// for every discovered (or legacy-configured) mount point.
pub fn new(config: DiskConfig) -> Self {
    // Create hysteresis thresholds for disk temperature from config
    let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
        config.temperature_warning_celsius,
        5.0, // 5°C gap for recovery
        config.temperature_critical_celsius,
        5.0, // 5°C gap for recovery
    );

    // Perform auto-discovery of storage topology
    let storage_topology = match Self::auto_discover_storage() {
        Ok(topology) => {
            debug!("Auto-discovered storage topology: {} single disks, {} mergerfs pools",
                topology.single_disks.len(), topology.mergerfs_pools.len());
            Some(topology)
        }
        Err(e) => {
            // Discovery failure is non-fatal: we fall back to config below.
            debug!("Failed to auto-discover storage topology: {}", e);
            None
        }
    };

    // Detect devices for discovered storage
    let mut detected_devices = std::collections::HashMap::new();
    if let Some(ref topology) = storage_topology {
        // Add single disks
        for disk in &topology.single_disks {
            if let Ok(devices) = Self::detect_device_for_mount_point_static(&disk.mount_point) {
                detected_devices.insert(disk.mount_point.clone(), devices);
            }
        }

        // Add mergerfs pools and their members
        for pool in &topology.mergerfs_pools {
            // Detect devices for the pool itself
            if let Ok(devices) = Self::detect_device_for_mount_point_static(&pool.mount_point) {
                detected_devices.insert(pool.mount_point.clone(), devices);
            }

            // Detect devices for member disks
            for member in &pool.data_members {
                if let Ok(devices) = Self::detect_device_for_mount_point_static(member) {
                    detected_devices.insert(member.clone(), devices);
                }
            }

            // Detect devices for parity disks
            for parity in &pool.parity_disks {
                if let Ok(devices) = Self::detect_device_for_mount_point_static(parity) {
                    detected_devices.insert(parity.clone(), devices);
                }
            }
        }
    } else {
        // Fallback: use legacy filesystem config detection
        for fs_config in &config.filesystems {
            if fs_config.monitor {
                if let Ok(devices) = Self::detect_device_for_mount_point_static(&fs_config.mount_point) {
                    detected_devices.insert(fs_config.mount_point.clone(), devices);
                }
            }
        }
    }

    Self {
        config,
        temperature_thresholds,
        detected_devices,
        storage_topology,
    }
}
|
||||
|
||||
/// Auto-discover storage topology by parsing system information
|
||||
fn auto_discover_storage() -> Result<StorageTopology> {
|
||||
let mounts = Self::parse_proc_mounts()?;
|
||||
let mut single_disks = Vec::new();
|
||||
let mut mergerfs_pools = Vec::new();
|
||||
|
||||
// Filter out unwanted filesystem types and mount points
|
||||
let exclude_fs_types = ["tmpfs", "devtmpfs", "sysfs", "proc", "cgroup", "cgroup2", "devpts"];
|
||||
let exclude_mount_prefixes = ["/proc", "/sys", "/dev", "/tmp", "/run"];
|
||||
|
||||
for mount in mounts {
|
||||
// Skip excluded filesystem types
|
||||
if exclude_fs_types.contains(&mount.fs_type.as_str()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip excluded mount point prefixes
|
||||
if exclude_mount_prefixes.iter().any(|prefix| mount.mount_point.starts_with(prefix)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
match mount.fs_type.as_str() {
|
||||
"fuse.mergerfs" => {
|
||||
// Parse mergerfs pool
|
||||
let data_members = Self::parse_mergerfs_sources(&mount.device);
|
||||
let parity_disks = Self::detect_parity_disks(&data_members);
|
||||
|
||||
mergerfs_pools.push(MergerfsPoolInfo {
|
||||
mount_point: mount.mount_point.clone(),
|
||||
data_members,
|
||||
parity_disks,
|
||||
});
|
||||
|
||||
debug!("Discovered mergerfs pool at {}", mount.mount_point);
|
||||
}
|
||||
"ext4" | "xfs" | "btrfs" | "ntfs" | "vfat" => {
|
||||
// Check if this mount is part of a mergerfs pool
|
||||
let is_mergerfs_member = mergerfs_pools.iter()
|
||||
.any(|pool| pool.data_members.contains(&mount.mount_point) ||
|
||||
pool.parity_disks.contains(&mount.mount_point));
|
||||
|
||||
if !is_mergerfs_member {
|
||||
debug!("Discovered single disk at {}", mount.mount_point);
|
||||
single_disks.push(mount);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
debug!("Skipping unsupported filesystem type: {}", mount.fs_type);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(StorageTopology {
|
||||
single_disks,
|
||||
mergerfs_pools,
|
||||
})
|
||||
}
|
||||
|
||||
/// Parse /proc/mounts to get all mount information
|
||||
fn parse_proc_mounts() -> Result<Vec<MountInfo>> {
|
||||
let mounts_content = fs::read_to_string("/proc/mounts")?;
|
||||
let mut mounts = Vec::new();
|
||||
|
||||
for line in mounts_content.lines() {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 3 {
|
||||
mounts.push(MountInfo {
|
||||
device: parts[0].to_string(),
|
||||
mount_point: parts[1].to_string(),
|
||||
fs_type: parts[2].to_string(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Ok(mounts)
|
||||
}
|
||||
|
||||
/// Parse mergerfs source string to extract member paths.
///
/// MergerFS encodes members as colon-separated paths, e.g.
/// "/mnt/disk1:/mnt/disk2:/mnt/disk3". Empty segments are dropped.
fn parse_mergerfs_sources(source: &str) -> Vec<String> {
    let mut members = Vec::new();
    for part in source.split(':') {
        let trimmed = part.trim();
        if !trimmed.is_empty() {
            members.push(trimmed.to_string());
        }
    }
    members
}
|
||||
|
||||
/// Detect potential parity disks based on data member heuristics
|
||||
fn detect_parity_disks(data_members: &[String]) -> Vec<String> {
|
||||
let mut parity_disks = Vec::new();
|
||||
|
||||
// Heuristic 1: Look for mount points with "parity" in the name
|
||||
if let Ok(mounts) = Self::parse_proc_mounts() {
|
||||
for mount in mounts {
|
||||
if mount.mount_point.to_lowercase().contains("parity") &&
|
||||
(mount.fs_type == "xfs" || mount.fs_type == "ext4") {
|
||||
debug!("Detected parity disk by name: {}", mount.mount_point);
|
||||
parity_disks.push(mount.mount_point);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Heuristic 2: Look for sequential device pattern
|
||||
// If data members are /mnt/disk1, /mnt/disk2, look for /mnt/disk* that's not in data
|
||||
if parity_disks.is_empty() {
|
||||
if let Some(pattern) = Self::extract_mount_pattern(data_members) {
|
||||
if let Ok(mounts) = Self::parse_proc_mounts() {
|
||||
for mount in mounts {
|
||||
if mount.mount_point.starts_with(&pattern) &&
|
||||
!data_members.contains(&mount.mount_point) &&
|
||||
(mount.fs_type == "xfs" || mount.fs_type == "ext4") {
|
||||
debug!("Detected parity disk by pattern: {}", mount.mount_point);
|
||||
parity_disks.push(mount.mount_point);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
parity_disks
|
||||
}
|
||||
|
||||
/// Extract the common mount-point pattern from data members
/// (e.g. "/mnt/disk" from "/mnt/disk1", "/mnt/disk2").
///
/// Fix: the previous implementation returned everything up to the last '/'
/// of the first member ("/mnt/"), so the parity heuristic matched ANY mount
/// under that directory. We now compute the true longest common prefix of
/// all members and trim trailing digits (so a single "/mnt/disk1" still
/// yields "/mnt/disk"). Returns None for an empty slice or when the prefix
/// contains no '/' (not a usable path pattern).
fn extract_mount_pattern(data_members: &[String]) -> Option<String> {
    let first = data_members.first()?;

    // Longest common prefix of all members, advancing by whole chars so we
    // never slice inside a UTF-8 sequence.
    let mut prefix = "";
    for (i, c) in first.char_indices() {
        let candidate = &first[..i + c.len_utf8()];
        if data_members.iter().all(|member| member.starts_with(candidate)) {
            prefix = candidate;
        } else {
            break;
        }
    }

    // Trim trailing digits so one member "/mnt/disk1" gives "/mnt/disk".
    let pattern = prefix.trim_end_matches(|c: char| c.is_ascii_digit());

    if pattern.contains('/') {
        Some(pattern.to_string())
    } else {
        None
    }
}
|
||||
|
||||
/// Calculate disk temperature status using hysteresis thresholds.
///
/// Delegates to the shared StatusTracker so a reading hovering near a
/// threshold does not flap between states; `metric_name` keys the tracker's
/// per-metric history and the thresholds come from this collector's
/// `temperature_thresholds` configuration.
fn calculate_temperature_status(&self, metric_name: &str, temperature: f32, status_tracker: &mut StatusTracker) -> Status {
    status_tracker.calculate_with_hysteresis(metric_name, temperature, &self.temperature_thresholds)
}
|
||||
|
||||
|
||||
/// Get storage pools using auto-discovered topology or fallback to configuration
|
||||
fn get_configured_storage_pools(&self) -> Result<Vec<StoragePool>> {
|
||||
if let Some(ref topology) = self.storage_topology {
|
||||
self.get_auto_discovered_storage_pools(topology)
|
||||
} else {
|
||||
self.get_legacy_configured_storage_pools()
|
||||
}
|
||||
}
|
||||
|
||||
/// Get storage pools from auto-discovered topology
|
||||
fn get_auto_discovered_storage_pools(&self, topology: &StorageTopology) -> Result<Vec<StoragePool>> {
|
||||
let mut storage_pools = Vec::new();
|
||||
|
||||
// Group single disks by physical drive for unified pool display
|
||||
let grouped_disks = self.group_filesystems_by_physical_drive(&topology.single_disks)?;
|
||||
|
||||
// Process grouped single disks (each physical drive becomes a pool)
|
||||
for (drive_name, filesystems) in grouped_disks {
|
||||
// Create a unified pool for this physical drive
|
||||
let pool = self.create_physical_drive_pool(&drive_name, &filesystems)?;
|
||||
storage_pools.push(pool);
|
||||
}
|
||||
|
||||
// IMPORTANT: Do not create individual filesystem pools when using auto-discovery
|
||||
// All single disk filesystems should be grouped into physical drive pools above
|
||||
|
||||
// Process mergerfs pools (these remain as logical pools)
|
||||
for pool_info in &topology.mergerfs_pools {
|
||||
if let Ok((total_bytes, used_bytes)) = self.get_filesystem_info(&pool_info.mount_point) {
|
||||
let available_bytes = total_bytes - used_bytes;
|
||||
let usage_percent = if total_bytes > 0 {
|
||||
(used_bytes as f64 / total_bytes as f64) * 100.0
|
||||
} else { 0.0 };
|
||||
|
||||
let size = self.bytes_to_human_readable(total_bytes);
|
||||
let used = self.bytes_to_human_readable(used_bytes);
|
||||
let available = self.bytes_to_human_readable(available_bytes);
|
||||
|
||||
// Collect all member and parity drives
|
||||
let mut all_drives = Vec::new();
|
||||
|
||||
// Add data member drives
|
||||
for member in &pool_info.data_members {
|
||||
if let Some(devices) = self.detected_devices.get(member) {
|
||||
all_drives.extend(devices.clone());
|
||||
}
|
||||
}
|
||||
|
||||
// Add parity drives
|
||||
for parity in &pool_info.parity_disks {
|
||||
if let Some(devices) = self.detected_devices.get(parity) {
|
||||
all_drives.extend(devices.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let underlying_drives = self.get_drive_info_for_devices(&all_drives)?;
|
||||
|
||||
// Calculate pool health
|
||||
let pool_health = self.calculate_mergerfs_pool_health(&pool_info.data_members, &pool_info.parity_disks, &underlying_drives);
|
||||
|
||||
// Generate pool name from mount point
|
||||
let name = pool_info.mount_point.trim_start_matches('/').replace('/', "_");
|
||||
|
||||
storage_pools.push(StoragePool {
|
||||
name,
|
||||
mount_point: pool_info.mount_point.clone(),
|
||||
filesystem: "fuse.mergerfs".to_string(),
|
||||
pool_type: StoragePoolType::MergerfsPool {
|
||||
data_disks: pool_info.data_members.iter()
|
||||
.filter_map(|member| self.detected_devices.get(member).and_then(|devices| devices.first().cloned()))
|
||||
.collect(),
|
||||
parity_disks: pool_info.parity_disks.iter()
|
||||
.filter_map(|parity| self.detected_devices.get(parity).and_then(|devices| devices.first().cloned()))
|
||||
.collect(),
|
||||
},
|
||||
size,
|
||||
used,
|
||||
available,
|
||||
usage_percent: usage_percent as f32,
|
||||
underlying_drives,
|
||||
pool_health,
|
||||
});
|
||||
|
||||
debug!("Auto-discovered mergerfs pool: {} with {} data + {} parity disks",
|
||||
pool_info.mount_point, pool_info.data_members.len(), pool_info.parity_disks.len());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(storage_pools)
|
||||
}
|
||||
|
||||
/// Group filesystems by their backing physical drive
|
||||
fn group_filesystems_by_physical_drive(&self, filesystems: &[MountInfo]) -> Result<std::collections::HashMap<String, Vec<MountInfo>>> {
|
||||
let mut grouped = std::collections::HashMap::new();
|
||||
|
||||
for fs in filesystems {
|
||||
// Get the physical drive name for this mount point
|
||||
if let Some(devices) = self.detected_devices.get(&fs.mount_point) {
|
||||
if let Some(device_name) = devices.first() {
|
||||
// Extract base drive name from detected device
|
||||
let drive_name = Self::extract_base_device(device_name)
|
||||
.unwrap_or_else(|| device_name.clone());
|
||||
|
||||
debug!("Grouping filesystem {} (device: {}) under drive: {}",
|
||||
fs.mount_point, device_name, drive_name);
|
||||
|
||||
grouped.entry(drive_name).or_insert_with(Vec::new).push(fs.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Filesystem grouping result: {} drives with filesystems: {:?}",
|
||||
grouped.len(),
|
||||
grouped.keys().collect::<Vec<_>>());
|
||||
|
||||
Ok(grouped)
|
||||
}
|
||||
|
||||
/// Create a physical drive pool containing multiple filesystems
|
||||
fn create_physical_drive_pool(&self, drive_name: &str, filesystems: &[MountInfo]) -> Result<StoragePool> {
|
||||
if filesystems.is_empty() {
|
||||
return Err(anyhow::anyhow!("No filesystems for drive {}", drive_name));
|
||||
}
|
||||
|
||||
// Calculate total usage across all filesystems on this drive
|
||||
let mut total_capacity = 0u64;
|
||||
let mut total_used = 0u64;
|
||||
|
||||
for fs in filesystems {
|
||||
if let Ok((capacity, used)) = self.get_filesystem_info(&fs.mount_point) {
|
||||
total_capacity += capacity;
|
||||
total_used += used;
|
||||
}
|
||||
}
|
||||
|
||||
let total_available = total_capacity.saturating_sub(total_used);
|
||||
let usage_percent = if total_capacity > 0 {
|
||||
(total_used as f64 / total_capacity as f64) * 100.0
|
||||
} else { 0.0 };
|
||||
|
||||
// Get drive information for SMART data
|
||||
let device_names = vec![drive_name.to_string()];
|
||||
let underlying_drives = self.get_drive_info_for_devices(&device_names)?;
|
||||
|
||||
// Collect filesystem mount points for this drive
|
||||
let filesystem_mount_points: Vec<String> = filesystems.iter()
|
||||
.map(|fs| fs.mount_point.clone())
|
||||
.collect();
|
||||
|
||||
Ok(StoragePool {
|
||||
name: drive_name.to_string(),
|
||||
mount_point: format!("(physical drive)"), // Special marker for physical drives
|
||||
filesystem: "physical".to_string(),
|
||||
pool_type: StoragePoolType::PhysicalDrive {
|
||||
filesystems: filesystem_mount_points,
|
||||
},
|
||||
size: self.bytes_to_human_readable(total_capacity),
|
||||
used: self.bytes_to_human_readable(total_used),
|
||||
available: self.bytes_to_human_readable(total_available),
|
||||
usage_percent: usage_percent as f32,
|
||||
pool_health: if underlying_drives.iter().all(|d| d.health_status == "PASSED") {
|
||||
PoolHealth::Healthy
|
||||
} else {
|
||||
PoolHealth::Critical
|
||||
},
|
||||
underlying_drives,
|
||||
})
|
||||
}
|
||||
|
||||
/// Calculate pool health specifically for mergerfs pools
|
||||
fn calculate_mergerfs_pool_health(&self, data_members: &[String], parity_disks: &[String], drives: &[DriveInfo]) -> PoolHealth {
|
||||
// Get device names for data and parity drives
|
||||
let mut data_device_names = Vec::new();
|
||||
let mut parity_device_names = Vec::new();
|
||||
|
||||
for member in data_members {
|
||||
if let Some(devices) = self.detected_devices.get(member) {
|
||||
data_device_names.extend(devices.clone());
|
||||
}
|
||||
}
|
||||
|
||||
for parity in parity_disks {
|
||||
if let Some(devices) = self.detected_devices.get(parity) {
|
||||
parity_device_names.extend(devices.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let failed_data = drives.iter()
|
||||
.filter(|d| data_device_names.contains(&d.device) && d.health_status != "PASSED")
|
||||
.count();
|
||||
let failed_parity = drives.iter()
|
||||
.filter(|d| parity_device_names.contains(&d.device) && d.health_status != "PASSED")
|
||||
.count();
|
||||
|
||||
match (failed_data, failed_parity) {
|
||||
(0, 0) => PoolHealth::Healthy,
|
||||
(1, 0) => PoolHealth::Degraded, // Can recover with parity
|
||||
(0, 1) => PoolHealth::Degraded, // Lost parity protection
|
||||
_ => PoolHealth::Critical, // Multiple failures
|
||||
}
|
||||
}
|
||||
|
||||
/// Fallback to legacy configuration-based storage pools
|
||||
fn get_legacy_configured_storage_pools(&self) -> Result<Vec<StoragePool>> {
|
||||
let mut storage_pools = Vec::new();
|
||||
let mut processed_pools = std::collections::HashSet::new();
|
||||
|
||||
// Legacy implementation: use filesystem configuration
|
||||
for fs_config in &self.config.filesystems {
|
||||
if !fs_config.monitor {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (pool_type, skip_in_single_mode) = self.determine_pool_type(&fs_config.storage_type);
|
||||
|
||||
// Skip member disks if they're part of a pool
|
||||
if skip_in_single_mode {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if this pool was already processed (in case of multiple member disks)
|
||||
let pool_key = match &pool_type {
|
||||
StoragePoolType::MergerfsPool { .. } => {
|
||||
// For mergerfs pools, use the main mount point
|
||||
if fs_config.fs_type == "fuse.mergerfs" {
|
||||
fs_config.mount_point.clone()
|
||||
} else {
|
||||
continue; // Skip member disks
|
||||
}
|
||||
}
|
||||
_ => fs_config.mount_point.clone()
|
||||
};
|
||||
|
||||
if processed_pools.contains(&pool_key) {
|
||||
continue;
|
||||
}
|
||||
processed_pools.insert(pool_key.clone());
|
||||
|
||||
// Get filesystem stats for the mount point
|
||||
match self.get_filesystem_info(&fs_config.mount_point) {
|
||||
Ok((total_bytes, used_bytes)) => {
|
||||
let available_bytes = total_bytes - used_bytes;
|
||||
let usage_percent = if total_bytes > 0 {
|
||||
(used_bytes as f64 / total_bytes as f64) * 100.0
|
||||
} else { 0.0 };
|
||||
|
||||
// Convert bytes to human-readable format
|
||||
let size = self.bytes_to_human_readable(total_bytes);
|
||||
let used = self.bytes_to_human_readable(used_bytes);
|
||||
let available = self.bytes_to_human_readable(available_bytes);
|
||||
|
||||
// Get underlying drives based on pool type
|
||||
let underlying_drives = self.get_pool_drives(&pool_type, &fs_config.mount_point)?;
|
||||
|
||||
// Calculate pool health
|
||||
let pool_health = self.calculate_pool_health(&pool_type, &underlying_drives);
|
||||
let drive_count = underlying_drives.len();
|
||||
|
||||
storage_pools.push(StoragePool {
|
||||
name: fs_config.name.clone(),
|
||||
mount_point: fs_config.mount_point.clone(),
|
||||
filesystem: fs_config.fs_type.clone(),
|
||||
pool_type: pool_type.clone(),
|
||||
size,
|
||||
used,
|
||||
available,
|
||||
usage_percent: usage_percent as f32,
|
||||
underlying_drives,
|
||||
pool_health,
|
||||
});
|
||||
|
||||
debug!(
|
||||
"Legacy configured storage pool '{}' ({:?}) at {} with {} drives, health: {:?}",
|
||||
fs_config.name, pool_type, fs_config.mount_point, drive_count, pool_health
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
debug!(
|
||||
"Failed to get filesystem info for storage pool '{}': {}",
|
||||
fs_config.name, e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(storage_pools)
|
||||
}
|
||||
|
||||
/// Determine the storage pool type from configuration
|
||||
fn determine_pool_type(&self, storage_type: &str) -> (StoragePoolType, bool) {
|
||||
match storage_type {
|
||||
"single" => (StoragePoolType::Single, false),
|
||||
"mergerfs_pool" | "mergerfs" => {
|
||||
// Find associated member disks
|
||||
let data_disks = self.find_pool_member_disks("mergerfs_member");
|
||||
let parity_disks = self.find_pool_member_disks("parity");
|
||||
(StoragePoolType::MergerfsPool { data_disks, parity_disks }, false)
|
||||
}
|
||||
"mergerfs_member" => (StoragePoolType::Single, true), // Skip, part of pool
|
||||
"parity" => (StoragePoolType::Single, true), // Skip, part of pool
|
||||
"raid1" | "raid5" | "raid6" => {
|
||||
let member_disks = self.find_pool_member_disks(&format!("{}_member", storage_type));
|
||||
(StoragePoolType::RaidArray {
|
||||
level: storage_type.to_uppercase(),
|
||||
member_disks,
|
||||
spare_disks: Vec::new()
|
||||
}, false)
|
||||
}
|
||||
_ => (StoragePoolType::Single, false) // Default to single
|
||||
}
|
||||
}
|
||||
|
||||
/// Find member disks for a specific storage type
|
||||
fn find_pool_member_disks(&self, member_type: &str) -> Vec<String> {
|
||||
let mut member_disks = Vec::new();
|
||||
|
||||
for fs_config in &self.config.filesystems {
|
||||
if fs_config.storage_type == member_type && fs_config.monitor {
|
||||
// Get device names for this mount point
|
||||
if let Some(devices) = self.detected_devices.get(&fs_config.mount_point) {
|
||||
member_disks.extend(devices.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
member_disks
|
||||
}
|
||||
|
||||
/// Get drive information for a specific pool type
|
||||
fn get_pool_drives(&self, pool_type: &StoragePoolType, mount_point: &str) -> Result<Vec<DriveInfo>> {
|
||||
match pool_type {
|
||||
StoragePoolType::Single => {
|
||||
// Single disk - use detected devices for this mount point
|
||||
let device_names = self.detected_devices.get(mount_point).cloned().unwrap_or_default();
|
||||
self.get_drive_info_for_devices(&device_names)
|
||||
}
|
||||
StoragePoolType::PhysicalDrive { .. } => {
|
||||
// Physical drive - get drive info for the drive directly (mount_point not used)
|
||||
let device_names = vec![mount_point.to_string()];
|
||||
self.get_drive_info_for_devices(&device_names)
|
||||
}
|
||||
StoragePoolType::MergerfsPool { data_disks, parity_disks } => {
|
||||
// Mergerfs pool - collect all member drives
|
||||
let mut all_disks = data_disks.clone();
|
||||
all_disks.extend(parity_disks.clone());
|
||||
self.get_drive_info_for_devices(&all_disks)
|
||||
}
|
||||
StoragePoolType::RaidArray { member_disks, spare_disks, .. } => {
|
||||
// RAID array - collect member and spare drives
|
||||
let mut all_disks = member_disks.clone();
|
||||
all_disks.extend(spare_disks.clone());
|
||||
self.get_drive_info_for_devices(&all_disks)
|
||||
}
|
||||
StoragePoolType::ZfsPool { .. } => {
|
||||
// ZFS pool - use detected devices (future implementation)
|
||||
let device_names = self.detected_devices.get(mount_point).cloned().unwrap_or_default();
|
||||
self.get_drive_info_for_devices(&device_names)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate pool health based on drive status and pool type
|
||||
fn calculate_pool_health(&self, pool_type: &StoragePoolType, drives: &[DriveInfo]) -> PoolHealth {
|
||||
match pool_type {
|
||||
StoragePoolType::Single => {
|
||||
// Single disk - health is just the drive health
|
||||
if drives.is_empty() {
|
||||
PoolHealth::Unknown
|
||||
} else if drives.iter().all(|d| d.health_status == "PASSED") {
|
||||
PoolHealth::Healthy
|
||||
} else {
|
||||
PoolHealth::Critical
|
||||
}
|
||||
}
|
||||
StoragePoolType::PhysicalDrive { .. } => {
|
||||
// Physical drive - health is just the drive health (similar to Single)
|
||||
if drives.is_empty() {
|
||||
PoolHealth::Unknown
|
||||
} else if drives.iter().all(|d| d.health_status == "PASSED") {
|
||||
PoolHealth::Healthy
|
||||
} else {
|
||||
PoolHealth::Critical
|
||||
}
|
||||
}
|
||||
StoragePoolType::MergerfsPool { data_disks, parity_disks } => {
|
||||
let failed_data = drives.iter()
|
||||
.filter(|d| data_disks.contains(&d.device) && d.health_status != "PASSED")
|
||||
.count();
|
||||
let failed_parity = drives.iter()
|
||||
.filter(|d| parity_disks.contains(&d.device) && d.health_status != "PASSED")
|
||||
.count();
|
||||
|
||||
match (failed_data, failed_parity) {
|
||||
(0, 0) => PoolHealth::Healthy,
|
||||
(1, 0) => PoolHealth::Degraded, // Can recover with parity
|
||||
(0, 1) => PoolHealth::Degraded, // Lost parity protection
|
||||
_ => PoolHealth::Critical, // Multiple failures
|
||||
}
|
||||
}
|
||||
StoragePoolType::RaidArray { level, .. } => {
|
||||
let failed_drives = drives.iter().filter(|d| d.health_status != "PASSED").count();
|
||||
|
||||
// Basic RAID health logic (can be enhanced per RAID level)
|
||||
match failed_drives {
|
||||
0 => PoolHealth::Healthy,
|
||||
1 if level.contains('1') || level.contains('5') || level.contains('6') => PoolHealth::Degraded,
|
||||
_ => PoolHealth::Critical,
|
||||
}
|
||||
}
|
||||
StoragePoolType::ZfsPool { .. } => {
|
||||
// ZFS health would require zpool status parsing (future)
|
||||
if drives.iter().all(|d| d.health_status == "PASSED") {
|
||||
PoolHealth::Healthy
|
||||
} else {
|
||||
PoolHealth::Degraded
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get drive information for a list of device names
|
||||
fn get_drive_info_for_devices(&self, device_names: &[String]) -> Result<Vec<DriveInfo>> {
|
||||
let mut drives = Vec::new();
|
||||
|
||||
for device_name in device_names {
|
||||
let device_path = format!("/dev/{}", device_name);
|
||||
|
||||
// Get SMART data for this drive
|
||||
let (health_status, temperature, wear_level) = self.get_smart_data(&device_path);
|
||||
|
||||
drives.push(DriveInfo {
|
||||
device: device_name.clone(),
|
||||
health_status: health_status.clone(),
|
||||
temperature,
|
||||
wear_level,
|
||||
});
|
||||
|
||||
debug!(
|
||||
"Drive info for {}: health={}, temp={:?}°C, wear={:?}%",
|
||||
device_name, health_status, temperature, wear_level
|
||||
);
|
||||
}
|
||||
|
||||
Ok(drives)
|
||||
}
|
||||
|
||||
/// Get SMART data for a drive (health, temperature, wear level)
|
||||
fn get_smart_data(&self, device_path: &str) -> (String, Option<f32>, Option<f32>) {
|
||||
// Try to get SMART data using smartctl
|
||||
let output = Command::new("sudo")
|
||||
.arg("smartctl")
|
||||
.arg("-a")
|
||||
.arg(device_path)
|
||||
.output();
|
||||
|
||||
match output {
|
||||
Ok(result) if result.status.success() => {
|
||||
let stdout = String::from_utf8_lossy(&result.stdout);
|
||||
|
||||
// Parse health status
|
||||
let health = if stdout.contains("PASSED") {
|
||||
"PASSED".to_string()
|
||||
} else if stdout.contains("FAILED") {
|
||||
"FAILED".to_string()
|
||||
} else {
|
||||
"UNKNOWN".to_string()
|
||||
};
|
||||
|
||||
// Parse temperature (look for various temperature indicators)
|
||||
let temperature = self.parse_temperature_from_smart(&stdout);
|
||||
|
||||
// Parse wear level (for SSDs)
|
||||
let wear_level = self.parse_wear_level_from_smart(&stdout);
|
||||
|
||||
(health, temperature, wear_level)
|
||||
}
|
||||
_ => {
|
||||
debug!("Failed to get SMART data for {}", device_path);
|
||||
("UNKNOWN".to_string(), None, None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse temperature from SMART output
|
||||
fn parse_temperature_from_smart(&self, smart_output: &str) -> Option<f32> {
|
||||
for line in smart_output.lines() {
|
||||
// Look for temperature in various formats
|
||||
if line.contains("Temperature_Celsius") || line.contains("Temperature") {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 10 {
|
||||
if let Ok(temp) = parts[9].parse::<f32>() {
|
||||
return Some(temp);
|
||||
}
|
||||
}
|
||||
}
|
||||
// NVMe drives might show temperature differently
|
||||
if line.contains("temperature:") {
|
||||
if let Some(temp_part) = line.split("temperature:").nth(1) {
|
||||
if let Some(temp_str) = temp_part.split_whitespace().next() {
|
||||
if let Ok(temp) = temp_str.parse::<f32>() {
|
||||
return Some(temp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Parse wear level from SMART output (SSD wear leveling)
|
||||
/// Supports both NVMe and SATA SSD wear indicators
|
||||
fn parse_wear_level_from_smart(&self, smart_output: &str) -> Option<f32> {
|
||||
for line in smart_output.lines() {
|
||||
let line = line.trim();
|
||||
|
||||
// NVMe drives - direct percentage used
|
||||
if line.contains("Percentage Used:") {
|
||||
if let Some(wear_part) = line.split("Percentage Used:").nth(1) {
|
||||
if let Some(wear_str) = wear_part.split('%').next() {
|
||||
if let Ok(wear) = wear_str.trim().parse::<f32>() {
|
||||
return Some(wear);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SATA SSD attributes - parse SMART table format
|
||||
// Format: ID ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 10 {
|
||||
// SSD Life Left / Percent Lifetime Remaining (higher = less wear)
|
||||
if line.contains("SSD_Life_Left") || line.contains("Percent_Lifetime_Remain") {
|
||||
if let Ok(remaining) = parts[3].parse::<f32>() { // VALUE column
|
||||
return Some(100.0 - remaining); // Convert remaining to used
|
||||
}
|
||||
}
|
||||
|
||||
// Media Wearout Indicator (lower = more wear, normalize to 0-100)
|
||||
if line.contains("Media_Wearout_Indicator") {
|
||||
if let Ok(remaining) = parts[3].parse::<f32>() { // VALUE column
|
||||
return Some(100.0 - remaining); // Convert remaining to used
|
||||
}
|
||||
}
|
||||
|
||||
// Wear Leveling Count (higher = less wear, but varies by manufacturer)
|
||||
if line.contains("Wear_Leveling_Count") {
|
||||
if let Ok(wear_count) = parts[3].parse::<f32>() { // VALUE column
|
||||
// Most SSDs: 100 = new, decreases with wear
|
||||
if wear_count <= 100.0 {
|
||||
return Some(100.0 - wear_count);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Total LBAs Written - calculate against typical endurance if available
|
||||
// This is more complex and manufacturer-specific, so we skip for now
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Convert bytes to human-readable format
|
||||
fn bytes_to_human_readable(&self, bytes: u64) -> String {
|
||||
const UNITS: &[&str] = &["B", "K", "M", "G", "T"];
|
||||
let mut size = bytes as f64;
|
||||
let mut unit_index = 0;
|
||||
|
||||
while size >= 1024.0 && unit_index < UNITS.len() - 1 {
|
||||
size /= 1024.0;
|
||||
unit_index += 1;
|
||||
}
|
||||
|
||||
if unit_index == 0 {
|
||||
format!("{:.0}{}", size, UNITS[unit_index])
|
||||
} else {
|
||||
format!("{:.1}{}", size, UNITS[unit_index])
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert bytes to gigabytes (binary: 1 GB = 1024^3 bytes).
///
/// NOTE: f32 loses precision for very large drives; adequate for display.
fn bytes_to_gb(&self, bytes: u64) -> f32 {
    bytes as f32 / (1024.0 * 1024.0 * 1024.0)
}
|
||||
|
||||
/// Detect device backing a mount point using lsblk (static version for startup)
|
||||
fn detect_device_for_mount_point_static(mount_point: &str) -> Result<Vec<String>> {
|
||||
let output = Command::new("lsblk")
|
||||
.args(&["-n", "-o", "NAME,MOUNTPOINT"])
|
||||
.output()?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
|
||||
for line in output_str.lines() {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 2 && parts[1] == mount_point {
|
||||
// Remove tree symbols and extract device name (e.g., "├─nvme0n1p2" -> "nvme0n1p2")
|
||||
let device_name = parts[0]
|
||||
.trim_start_matches('├')
|
||||
.trim_start_matches('└')
|
||||
.trim_start_matches('─')
|
||||
.trim();
|
||||
|
||||
// Extract base device name (e.g., "nvme0n1p2" -> "nvme0n1")
|
||||
if let Some(base_device) = Self::extract_base_device(device_name) {
|
||||
return Ok(vec![base_device]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Vec::new())
|
||||
}
|
||||
|
||||
/// Extract the base device name from a partition name
/// (e.g. "nvme0n1p2" -> "nvme0n1", "sda1" -> "sda").
///
/// Fix: NVMe whole-disk names such as "nvme0n1" previously fell through to
/// the generic digit-stripping path and were wrongly reduced to "nvme0n".
/// NVMe/eMMC-style names are now handled entirely in their own branch:
/// a trailing "p<digits>" partition suffix is stripped, otherwise the name
/// is returned unchanged.
fn extract_base_device(device_name: &str) -> Option<String> {
    // NVMe / mmcblk naming: "<disk>p<partition-number>".
    if device_name.starts_with("nvme") || device_name.starts_with("mmcblk") {
        if let Some(p_pos) = device_name.rfind('p') {
            let suffix = &device_name[p_pos + 1..];
            if !suffix.is_empty() && suffix.chars().all(|c| c.is_ascii_digit()) {
                return Some(device_name[..p_pos].to_string());
            }
        }
        // No partition suffix: already a whole-disk name.
        return Some(device_name.to_string());
    }

    // Traditional naming: "sda1" -> "sda". Strip trailing digits, but only
    // when a non-numeric stem remains (so "sda" stays "sda").
    let stem = device_name.trim_end_matches(|c: char| c.is_ascii_digit());
    if !stem.is_empty() && stem.len() < device_name.len() {
        return Some(stem.to_string());
    }

    // No partition suffix detected; return as-is.
    Some(device_name.to_string())
}
|
||||
|
||||
|
||||
/// Get filesystem info using df command
|
||||
fn get_filesystem_info(&self, path: &str) -> Result<(u64, u64)> {
|
||||
let output = Command::new("df")
|
||||
.arg("--block-size=1")
|
||||
.arg(path)
|
||||
.output()?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(anyhow::anyhow!("df command failed for {}", path));
|
||||
}
|
||||
|
||||
let output_str = String::from_utf8(output.stdout)?;
|
||||
let lines: Vec<&str> = output_str.lines().collect();
|
||||
|
||||
if lines.len() < 2 {
|
||||
return Err(anyhow::anyhow!("Unexpected df output format"));
|
||||
}
|
||||
|
||||
let fields: Vec<&str> = lines[1].split_whitespace().collect();
|
||||
if fields.len() < 4 {
|
||||
return Err(anyhow::anyhow!("Unexpected df fields count"));
|
||||
}
|
||||
|
||||
let total_bytes = fields[1].parse::<u64>()?;
|
||||
let used_bytes = fields[2].parse::<u64>()?;
|
||||
|
||||
Ok((total_bytes, used_bytes))
|
||||
}
|
||||
|
||||
|
||||
/// Parse size string (e.g., "120G", "45M") to GB value
|
||||
fn parse_size_to_gb(&self, size_str: &str) -> f32 {
|
||||
let size_str = size_str.trim();
|
||||
if size_str.is_empty() || size_str == "-" {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
// Extract numeric part and unit
|
||||
let (num_str, unit) = if let Some(last_char) = size_str.chars().last() {
|
||||
if last_char.is_alphabetic() {
|
||||
let num_part = &size_str[..size_str.len() - 1];
|
||||
let unit_part = &size_str[size_str.len() - 1..];
|
||||
(num_part, unit_part)
|
||||
} else {
|
||||
(size_str, "")
|
||||
}
|
||||
} else {
|
||||
(size_str, "")
|
||||
};
|
||||
|
||||
let number: f32 = num_str.parse().unwrap_or(0.0);
|
||||
|
||||
match unit.to_uppercase().as_str() {
|
||||
"T" | "TB" => number * 1024.0,
|
||||
"G" | "GB" => number,
|
||||
"M" | "MB" => number / 1024.0,
|
||||
"K" | "KB" => number / (1024.0 * 1024.0),
|
||||
"B" | "" => number / (1024.0 * 1024.0 * 1024.0),
|
||||
_ => number, // Assume GB if unknown unit
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for DiskCollector {
|
||||
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let start_time = Instant::now();
|
||||
debug!("Collecting storage pool and individual drive metrics");
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
|
||||
// Get configured storage pools with individual drive data
|
||||
let storage_pools = match self.get_configured_storage_pools() {
|
||||
Ok(pools) => {
|
||||
debug!("Found {} storage pools", pools.len());
|
||||
pools
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to get storage pools: {}", e);
|
||||
Vec::new()
|
||||
}
|
||||
};
|
||||
|
||||
// Generate metrics for each storage pool and its underlying drives
|
||||
for storage_pool in &storage_pools {
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
// Storage pool overall metrics
|
||||
let pool_name = &storage_pool.name;
|
||||
|
||||
// Parse size strings to get actual values for calculations
|
||||
let size_gb = self.parse_size_to_gb(&storage_pool.size);
|
||||
let used_gb = self.parse_size_to_gb(&storage_pool.used);
|
||||
let avail_gb = self.parse_size_to_gb(&storage_pool.available);
|
||||
|
||||
// Calculate status based on configured thresholds and pool health
|
||||
let usage_status = if storage_pool.usage_percent >= self.config.usage_critical_percent {
|
||||
Status::Critical
|
||||
} else if storage_pool.usage_percent >= self.config.usage_warning_percent {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
|
||||
let pool_status = match storage_pool.pool_health {
|
||||
PoolHealth::Critical => Status::Critical,
|
||||
PoolHealth::Degraded => Status::Warning,
|
||||
PoolHealth::Rebuilding => Status::Warning,
|
||||
PoolHealth::Healthy => usage_status,
|
||||
PoolHealth::Unknown => Status::Unknown,
|
||||
};
|
||||
|
||||
// Storage pool info metrics
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_mount_point", pool_name),
|
||||
value: MetricValue::String(storage_pool.mount_point.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("Mount: {}", storage_pool.mount_point)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_filesystem", pool_name),
|
||||
value: MetricValue::String(storage_pool.filesystem.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("FS: {}", storage_pool.filesystem)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Enhanced pool type information
|
||||
let pool_type_str = match &storage_pool.pool_type {
|
||||
StoragePoolType::Single => "single".to_string(),
|
||||
StoragePoolType::PhysicalDrive { filesystems } => {
|
||||
format!("drive ({})", filesystems.len())
|
||||
}
|
||||
StoragePoolType::MergerfsPool { data_disks, parity_disks } => {
|
||||
format!("mergerfs ({}+{})", data_disks.len(), parity_disks.len())
|
||||
}
|
||||
StoragePoolType::RaidArray { level, member_disks, spare_disks } => {
|
||||
format!("{} ({}+{})", level, member_disks.len(), spare_disks.len())
|
||||
}
|
||||
StoragePoolType::ZfsPool { pool_name, .. } => {
|
||||
format!("zfs ({})", pool_name)
|
||||
}
|
||||
};
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_pool_type", pool_name),
|
||||
value: MetricValue::String(pool_type_str.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("Type: {}", pool_type_str)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Pool health status
|
||||
let health_str = match storage_pool.pool_health {
|
||||
PoolHealth::Healthy => "healthy",
|
||||
PoolHealth::Degraded => "degraded",
|
||||
PoolHealth::Critical => "critical",
|
||||
PoolHealth::Rebuilding => "rebuilding",
|
||||
PoolHealth::Unknown => "unknown",
|
||||
};
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_pool_health", pool_name),
|
||||
value: MetricValue::String(health_str.to_string()),
|
||||
unit: None,
|
||||
description: Some(format!("Health: {}", health_str)),
|
||||
status: pool_status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Storage pool size metrics
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_total_gb", pool_name),
|
||||
value: MetricValue::Float(size_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Total: {}", storage_pool.size)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_used_gb", pool_name),
|
||||
value: MetricValue::Float(used_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Used: {}", storage_pool.used)),
|
||||
status: pool_status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_available_gb", pool_name),
|
||||
value: MetricValue::Float(avail_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Available: {}", storage_pool.available)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_usage_percent", pool_name),
|
||||
value: MetricValue::Float(storage_pool.usage_percent),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some(format!("Usage: {:.1}%", storage_pool.usage_percent)),
|
||||
status: pool_status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Individual drive metrics for this storage pool
|
||||
for drive in &storage_pool.underlying_drives {
|
||||
// Drive health status
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_{}_health", pool_name, drive.device),
|
||||
value: MetricValue::String(drive.health_status.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("{}: {}", drive.device, drive.health_status)),
|
||||
status: if drive.health_status == "PASSED" { Status::Ok }
|
||||
else if drive.health_status == "FAILED" { Status::Critical }
|
||||
else { Status::Unknown },
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Drive temperature
|
||||
if let Some(temp) = drive.temperature {
|
||||
let temp_status = self.calculate_temperature_status(
|
||||
&format!("disk_{}_{}_temperature", pool_name, drive.device),
|
||||
temp,
|
||||
status_tracker
|
||||
);
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_{}_temperature", pool_name, drive.device),
|
||||
value: MetricValue::Float(temp),
|
||||
unit: Some("°C".to_string()),
|
||||
description: Some(format!("{}: {:.0}°C", drive.device, temp)),
|
||||
status: temp_status,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
|
||||
// Drive wear level (for SSDs)
|
||||
if let Some(wear) = drive.wear_level {
|
||||
let wear_status = if wear >= self.config.wear_critical_percent { Status::Critical }
|
||||
else if wear >= self.config.wear_warning_percent { Status::Warning }
|
||||
else { Status::Ok };
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_{}_wear_percent", pool_name, drive.device),
|
||||
value: MetricValue::Float(wear),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some(format!("{}: {:.0}% wear", drive.device, wear)),
|
||||
status: wear_status,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Individual filesystem metrics for PhysicalDrive pools
|
||||
if let StoragePoolType::PhysicalDrive { filesystems } = &storage_pool.pool_type {
|
||||
for filesystem_mount in filesystems {
|
||||
if let Ok((total_bytes, used_bytes)) = self.get_filesystem_info(filesystem_mount) {
|
||||
let available_bytes = total_bytes - used_bytes;
|
||||
let usage_percent = if total_bytes > 0 {
|
||||
(used_bytes as f64 / total_bytes as f64) * 100.0
|
||||
} else { 0.0 };
|
||||
|
||||
let filesystem_name = if filesystem_mount == "/" {
|
||||
"root".to_string()
|
||||
} else {
|
||||
filesystem_mount.trim_start_matches('/').replace('/', "_")
|
||||
};
|
||||
|
||||
// Calculate filesystem status based on usage
|
||||
let fs_status = if usage_percent >= self.config.usage_critical_percent as f64 {
|
||||
Status::Critical
|
||||
} else if usage_percent >= self.config.usage_warning_percent as f64 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
|
||||
// Filesystem usage metrics
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_fs_{}_usage_percent", pool_name, filesystem_name),
|
||||
value: MetricValue::Float(usage_percent as f32),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some(format!("{}: {:.0}%", filesystem_mount, usage_percent)),
|
||||
status: fs_status.clone(),
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_fs_{}_used_gb", pool_name, filesystem_name),
|
||||
value: MetricValue::Float(self.bytes_to_gb(used_bytes)),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("{}: {}GB used", filesystem_mount, self.bytes_to_human_readable(used_bytes))),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_fs_{}_total_gb", pool_name, filesystem_name),
|
||||
value: MetricValue::Float(self.bytes_to_gb(total_bytes)),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("{}: {}GB total", filesystem_mount, self.bytes_to_human_readable(total_bytes))),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_fs_{}_available_gb", pool_name, filesystem_name),
|
||||
value: MetricValue::Float(self.bytes_to_gb(available_bytes)),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("{}: {}GB available", filesystem_mount, self.bytes_to_human_readable(available_bytes))),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_fs_{}_mount_point", pool_name, filesystem_name),
|
||||
value: MetricValue::String(filesystem_mount.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("Mount: {}", filesystem_mount)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add storage pool count metric
|
||||
metrics.push(Metric {
|
||||
name: "disk_count".to_string(),
|
||||
value: MetricValue::Integer(storage_pools.len() as i64),
|
||||
unit: None,
|
||||
description: Some(format!("Total storage pools: {}", storage_pools.len())),
|
||||
status: Status::Ok,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
|
||||
|
||||
let collection_time = start_time.elapsed();
|
||||
debug!(
|
||||
"Multi-disk collection completed in {:?} with {} metrics",
|
||||
collection_time,
|
||||
metrics.len()
|
||||
);
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
}
|
||||
@@ -4,7 +4,7 @@ use thiserror::Error;
|
||||
pub enum CollectorError {
|
||||
#[error("Failed to read system file {path}: {error}")]
|
||||
SystemRead { path: String, error: String },
|
||||
|
||||
|
||||
#[error("Failed to parse value '{value}': {error}")]
|
||||
Parse { value: String, error: String },
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,68 +1,47 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, registry};
|
||||
use cm_dashboard_shared::{AgentData, TmpfsData, HysteresisThresholds, Status};
|
||||
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError, utils};
|
||||
use super::{utils, Collector, CollectorError};
|
||||
use crate::config::MemoryConfig;
|
||||
|
||||
/// Extremely efficient memory metrics collector
|
||||
///
|
||||
///
|
||||
/// EFFICIENCY OPTIMIZATIONS:
|
||||
/// - Single /proc/meminfo read for all memory metrics
|
||||
/// - Minimal string parsing with split operations
|
||||
/// - Pre-calculated KB to GB conversion
|
||||
/// - No regex or complex parsing
|
||||
/// - <0.1ms collection time target
|
||||
/// - Minimal string allocations
|
||||
/// - No process spawning for basic metrics
|
||||
/// - <0.5ms collection time target
|
||||
pub struct MemoryCollector {
|
||||
config: MemoryConfig,
|
||||
name: String,
|
||||
}
|
||||
|
||||
/// Memory information parsed from /proc/meminfo
|
||||
#[derive(Debug, Default)]
|
||||
struct MemoryInfo {
|
||||
total_kb: u64,
|
||||
available_kb: u64,
|
||||
free_kb: u64,
|
||||
buffers_kb: u64,
|
||||
cached_kb: u64,
|
||||
swap_total_kb: u64,
|
||||
swap_free_kb: u64,
|
||||
usage_thresholds: HysteresisThresholds,
|
||||
}
|
||||
|
||||
impl MemoryCollector {
|
||||
pub fn new(config: MemoryConfig) -> Self {
|
||||
// Create hysteresis thresholds with 10% gap for recovery
|
||||
let usage_thresholds = HysteresisThresholds::new(
|
||||
config.usage_warning_percent,
|
||||
config.usage_critical_percent,
|
||||
);
|
||||
|
||||
Self {
|
||||
config,
|
||||
name: "memory".to_string(),
|
||||
|
||||
usage_thresholds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate memory usage status using configured thresholds
|
||||
fn calculate_usage_status(&self, usage_percent: f32) -> Status {
|
||||
if usage_percent >= self.config.usage_critical_percent {
|
||||
Status::Critical
|
||||
} else if usage_percent >= self.config.usage_warning_percent {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Parse /proc/meminfo efficiently
|
||||
/// Format: "MemTotal: 16384000 kB"
|
||||
async fn parse_meminfo(&self) -> Result<MemoryInfo, CollectorError> {
|
||||
let content = utils::read_proc_file("/proc/meminfo")?;
|
||||
let mut info = MemoryInfo::default();
|
||||
|
||||
|
||||
// Parse each line efficiently - only extract what we need
|
||||
for line in content.lines() {
|
||||
if let Some(colon_pos) = line.find(':') {
|
||||
let key = &line[..colon_pos];
|
||||
let value_part = &line[colon_pos + 1..];
|
||||
|
||||
|
||||
// Extract number from value part (format: " 12345 kB")
|
||||
if let Some(number_str) = value_part.split_whitespace().next() {
|
||||
if let Ok(value_kb) = utils::parse_u64(number_str) {
|
||||
@@ -80,7 +59,7 @@ impl MemoryCollector {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate that we got essential fields
|
||||
if info.total_kb == 0 {
|
||||
return Err(CollectorError::Parse {
|
||||
@@ -88,124 +67,174 @@ impl MemoryCollector {
|
||||
error: "MemTotal not found or zero in /proc/meminfo".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// If MemAvailable is not available (older kernels), calculate it
|
||||
if info.available_kb == 0 {
|
||||
info.available_kb = info.free_kb + info.buffers_kb + info.cached_kb;
|
||||
}
|
||||
|
||||
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
/// Convert KB to GB efficiently (avoiding floating point in hot path)
|
||||
fn kb_to_gb(kb: u64) -> f32 {
|
||||
kb as f32 / 1_048_576.0 // 1024 * 1024
|
||||
}
|
||||
|
||||
/// Calculate memory metrics from parsed info
|
||||
fn calculate_metrics(&self, info: &MemoryInfo) -> Vec<Metric> {
|
||||
let mut metrics = Vec::with_capacity(6);
|
||||
|
||||
|
||||
/// Populate memory data directly into AgentData
|
||||
async fn populate_memory_data(&self, info: &MemoryInfo, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
// Calculate derived values
|
||||
let used_kb = info.total_kb - info.available_kb;
|
||||
let usage_percent = (used_kb as f32 / info.total_kb as f32) * 100.0;
|
||||
let usage_status = self.calculate_usage_status(usage_percent);
|
||||
let available = info.available_kb;
|
||||
let used = info.total_kb - available;
|
||||
let usage_percent = (used as f32 / info.total_kb as f32) * 100.0;
|
||||
|
||||
// Populate basic memory fields
|
||||
agent_data.system.memory.usage_percent = usage_percent;
|
||||
agent_data.system.memory.total_gb = info.total_kb as f32 / (1024.0 * 1024.0);
|
||||
agent_data.system.memory.used_gb = used as f32 / (1024.0 * 1024.0);
|
||||
|
||||
// Populate swap data if available
|
||||
agent_data.system.memory.swap_total_gb = info.swap_total_kb as f32 / (1024.0 * 1024.0);
|
||||
agent_data.system.memory.swap_used_gb = (info.swap_total_kb - info.swap_free_kb) as f32 / (1024.0 * 1024.0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Populate tmpfs data into AgentData
|
||||
async fn populate_tmpfs_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
// Discover all tmpfs mount points
|
||||
let tmpfs_mounts = self.discover_tmpfs_mounts()?;
|
||||
|
||||
let swap_used_kb = info.swap_total_kb - info.swap_free_kb;
|
||||
|
||||
// Convert to GB for metrics
|
||||
let total_gb = Self::kb_to_gb(info.total_kb);
|
||||
let used_gb = Self::kb_to_gb(used_kb);
|
||||
let available_gb = Self::kb_to_gb(info.available_kb);
|
||||
let swap_total_gb = Self::kb_to_gb(info.swap_total_kb);
|
||||
let swap_used_gb = Self::kb_to_gb(swap_used_kb);
|
||||
|
||||
// Memory usage percentage (primary metric with status)
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_USAGE_PERCENT.to_string(),
|
||||
MetricValue::Float(usage_percent),
|
||||
usage_status,
|
||||
).with_description("Memory usage percentage".to_string())
|
||||
.with_unit("%".to_string()));
|
||||
|
||||
// Total memory
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(total_gb),
|
||||
Status::Ok, // Total memory doesn't have status
|
||||
).with_description("Total system memory".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
// Used memory
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_USED_GB.to_string(),
|
||||
MetricValue::Float(used_gb),
|
||||
Status::Ok, // Used memory absolute value doesn't have status
|
||||
).with_description("Used system memory".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
// Available memory
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_AVAILABLE_GB.to_string(),
|
||||
MetricValue::Float(available_gb),
|
||||
Status::Ok, // Available memory absolute value doesn't have status
|
||||
).with_description("Available system memory".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
// Swap metrics (only if swap exists)
|
||||
if info.swap_total_kb > 0 {
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(swap_total_gb),
|
||||
Status::Ok,
|
||||
).with_description("Total swap space".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_SWAP_USED_GB.to_string(),
|
||||
MetricValue::Float(swap_used_gb),
|
||||
Status::Ok,
|
||||
).with_description("Used swap space".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
if tmpfs_mounts.is_empty() {
|
||||
debug!("No tmpfs mounts found to monitor");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
metrics
|
||||
|
||||
// Get usage data for all tmpfs mounts at once using df
|
||||
let mut df_args = vec!["df", "--output=target,size,used", "--block-size=1"];
|
||||
df_args.extend(tmpfs_mounts.iter().map(|s| s.as_str()));
|
||||
|
||||
let df_output = std::process::Command::new(df_args[0])
|
||||
.args(&df_args[1..])
|
||||
.output()
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: "tmpfs mounts".to_string(),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
|
||||
let df_str = String::from_utf8_lossy(&df_output.stdout);
|
||||
let df_lines: Vec<&str> = df_str.lines().skip(1).collect(); // Skip header
|
||||
|
||||
// Process each tmpfs mount
|
||||
for (i, mount_point) in tmpfs_mounts.iter().enumerate() {
|
||||
if i >= df_lines.len() {
|
||||
debug!("Not enough df output lines for tmpfs mount: {}", mount_point);
|
||||
continue;
|
||||
}
|
||||
|
||||
let parts: Vec<&str> = df_lines[i].split_whitespace().collect();
|
||||
if parts.len() < 3 {
|
||||
debug!("Invalid df output for tmpfs mount: {}", mount_point);
|
||||
continue;
|
||||
}
|
||||
|
||||
let total_bytes: u64 = parts[1].parse().unwrap_or(0);
|
||||
let used_bytes: u64 = parts[2].parse().unwrap_or(0);
|
||||
|
||||
if total_bytes == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let total_gb = total_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
|
||||
let used_gb = used_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
|
||||
let usage_percent = (used_bytes as f32 / total_bytes as f32) * 100.0;
|
||||
|
||||
// Add to tmpfs list
|
||||
agent_data.system.memory.tmpfs.push(TmpfsData {
|
||||
mount: mount_point.clone(),
|
||||
usage_percent,
|
||||
used_gb,
|
||||
total_gb,
|
||||
});
|
||||
}
|
||||
|
||||
// Sort tmpfs mounts by mount point for consistent display order
|
||||
agent_data.system.memory.tmpfs.sort_by(|a, b| a.mount.cmp(&b.mount));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Discover all tmpfs mount points from /proc/mounts
|
||||
fn discover_tmpfs_mounts(&self) -> Result<Vec<String>, CollectorError> {
|
||||
let content = utils::read_proc_file("/proc/mounts")?;
|
||||
let mut tmpfs_mounts = Vec::new();
|
||||
|
||||
for line in content.lines() {
|
||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||
if fields.len() >= 3 && fields[2] == "tmpfs" {
|
||||
let mount_point = fields[1];
|
||||
|
||||
// Filter out system/internal tmpfs mounts that aren't useful for monitoring
|
||||
if self.should_monitor_tmpfs(mount_point) {
|
||||
tmpfs_mounts.push(mount_point.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Discovered {} tmpfs mounts: {:?}", tmpfs_mounts.len(), tmpfs_mounts);
|
||||
Ok(tmpfs_mounts)
|
||||
}
|
||||
|
||||
/// Determine if a tmpfs mount point should be monitored
|
||||
fn should_monitor_tmpfs(&self, mount_point: &str) -> bool {
|
||||
// Include commonly useful tmpfs mounts
|
||||
matches!(mount_point,
|
||||
"/tmp" | "/var/tmp" | "/dev/shm" | "/run" | "/var/log"
|
||||
) || mount_point.starts_with("/run/user/") // User session tmpfs
|
||||
}
|
||||
|
||||
/// Calculate memory usage status based on thresholds
|
||||
fn calculate_memory_status(&self, usage_percent: f32) -> Status {
|
||||
self.usage_thresholds.evaluate(usage_percent)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for MemoryCollector {
|
||||
fn name(&self) -> &str {
|
||||
&self.name
|
||||
}
|
||||
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
debug!("Collecting memory metrics");
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
|
||||
// Parse memory info from /proc/meminfo
|
||||
let info = self.parse_meminfo().await?;
|
||||
|
||||
// Calculate all metrics from parsed info
|
||||
let metrics = self.calculate_metrics(&info);
|
||||
|
||||
|
||||
// Populate memory data directly
|
||||
self.populate_memory_data(&info, agent_data).await?;
|
||||
|
||||
// Collect tmpfs data
|
||||
self.populate_tmpfs_data(agent_data).await?;
|
||||
|
||||
let duration = start.elapsed();
|
||||
debug!("Memory collection completed in {:?} with {} metrics", duration, metrics.len());
|
||||
|
||||
debug!("Memory collection completed in {:?}", duration);
|
||||
|
||||
// Efficiency check: warn if collection takes too long
|
||||
if duration.as_millis() > 1 {
|
||||
debug!("Memory collection took {}ms - consider optimization", duration.as_millis());
|
||||
debug!(
|
||||
"Memory collection took {}ms - consider optimization",
|
||||
duration.as_millis()
|
||||
);
|
||||
}
|
||||
|
||||
// Store performance metrics
|
||||
// Performance tracking handled by cache system
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
fn get_performance_metrics(&self) -> Option<super::PerformanceMetrics> {
|
||||
None // Performance tracking handled by cache system
|
||||
|
||||
// Calculate status using thresholds
|
||||
agent_data.system.memory.usage_status = self.calculate_memory_status(agent_data.system.memory.usage_percent);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Internal structure for parsing /proc/meminfo
|
||||
#[derive(Default)]
|
||||
struct MemoryInfo {
|
||||
total_kb: u64,
|
||||
available_kb: u64,
|
||||
free_kb: u64,
|
||||
buffers_kb: u64,
|
||||
cached_kb: u64,
|
||||
swap_total_kb: u64,
|
||||
swap_free_kb: u64,
|
||||
}
|
||||
@@ -1,86 +1,72 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::Metric;
|
||||
use std::time::Duration;
|
||||
use cm_dashboard_shared::{AgentData};
|
||||
|
||||
|
||||
pub mod cpu;
|
||||
pub mod memory;
|
||||
pub mod disk;
|
||||
pub mod systemd;
|
||||
pub mod backup;
|
||||
pub mod cpu;
|
||||
pub mod disk;
|
||||
pub mod error;
|
||||
pub mod memory;
|
||||
pub mod network;
|
||||
pub mod nixos;
|
||||
pub mod systemd;
|
||||
|
||||
pub use error::CollectorError;
|
||||
|
||||
/// Performance metrics for a collector
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PerformanceMetrics {
|
||||
pub last_collection_time: Duration,
|
||||
pub collection_efficiency_percent: f32,
|
||||
}
|
||||
|
||||
/// Base trait for all collectors with extreme efficiency requirements
|
||||
/// Base trait for all collectors with direct structured data output
|
||||
#[async_trait]
|
||||
pub trait Collector: Send + Sync {
|
||||
/// Name of this collector
|
||||
fn name(&self) -> &str;
|
||||
|
||||
/// Collect all metrics this collector provides
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError>;
|
||||
|
||||
/// Get performance metrics for monitoring collector efficiency
|
||||
fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
|
||||
None
|
||||
}
|
||||
/// Collect data and populate AgentData directly with status evaluation
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError>;
|
||||
}
|
||||
|
||||
/// CPU efficiency rules for all collectors
|
||||
pub mod efficiency {
|
||||
/// CRITICAL: All collectors must follow these efficiency rules to minimize system impact
|
||||
|
||||
/// 1. FILE READING RULES
|
||||
/// - Read entire files in single syscall when possible
|
||||
/// - Use BufReader only for very large files (>4KB)
|
||||
/// - Never read files character by character
|
||||
/// - Cache file descriptors when safe (immutable paths)
|
||||
|
||||
/// 2. PARSING RULES
|
||||
/// - Use split() instead of regex for simple patterns
|
||||
/// - Parse numbers with from_str() not complex parsing
|
||||
/// - Avoid string allocations in hot paths
|
||||
/// - Use str::trim() before parsing numbers
|
||||
|
||||
/// 3. MEMORY ALLOCATION RULES
|
||||
/// - Reuse Vec buffers when possible
|
||||
/// - Pre-allocate collections with known sizes
|
||||
/// - Use str slices instead of String when possible
|
||||
/// - Avoid clone() in hot paths
|
||||
|
||||
/// 4. SYSTEM CALL RULES
|
||||
/// - Minimize syscalls - prefer single reads over multiple
|
||||
/// - Use /proc filesystem efficiently
|
||||
/// - Avoid spawning processes when /proc data available
|
||||
/// - Cache static data (like CPU count)
|
||||
|
||||
/// 5. ERROR HANDLING RULES
|
||||
/// - Use Result<> but minimize allocation in error paths
|
||||
/// - Log errors at debug level only to avoid I/O overhead
|
||||
/// - Graceful degradation - missing metrics better than failing
|
||||
/// - Never panic in collectors
|
||||
|
||||
/// 6. CONCURRENCY RULES
|
||||
/// - Collectors must be thread-safe but avoid locks
|
||||
/// - Use atomic operations for simple counters
|
||||
/// - Avoid shared mutable state between collections
|
||||
/// - Each collection should be independent
|
||||
|
||||
pub const PERFORMANCE_TARGET_OVERHEAD_PERCENT: f32 = 0.1;
|
||||
//! CRITICAL: All collectors must follow these efficiency rules to minimize system impact
|
||||
//!
|
||||
//! # FILE READING RULES
|
||||
//! - Read entire files in single syscall when possible
|
||||
//! - Use BufReader only for very large files (>4KB)
|
||||
//! - Never read files character by character
|
||||
//! - Cache file descriptors when safe (immutable paths)
|
||||
//!
|
||||
//! # PARSING RULES
|
||||
//! - Use split() instead of regex for simple patterns
|
||||
//! - Parse numbers with from_str() not complex parsing
|
||||
//! - Avoid string allocations in hot paths
|
||||
//! - Use str::trim() before parsing numbers
|
||||
//!
|
||||
//! # MEMORY ALLOCATION RULES
|
||||
//! - Reuse Vec buffers when possible
|
||||
//! - Pre-allocate collections with known sizes
|
||||
//! - Use str slices instead of String when possible
|
||||
//! - Avoid clone() in hot paths
|
||||
//!
|
||||
//! # SYSTEM CALL RULES
|
||||
//! - Minimize syscalls - prefer single reads over multiple
|
||||
//! - Use /proc filesystem efficiently
|
||||
//! - Avoid spawning processes when /proc data available
|
||||
//! - Cache static data (like CPU count)
|
||||
//!
|
||||
//! # ERROR HANDLING RULES
|
||||
//! - Use Result<> but minimize allocation in error paths
|
||||
//! - Log errors at debug level only to avoid I/O overhead
|
||||
//! - Graceful degradation - missing metrics better than failing
|
||||
//! - Never panic in collectors
|
||||
//!
|
||||
//! # CONCURRENCY RULES
|
||||
//! - Collectors must be thread-safe but avoid locks
|
||||
//! - Use atomic operations for simple counters
|
||||
//! - Avoid shared mutable state between collections
|
||||
//! - Each collection should be independent
|
||||
}
|
||||
|
||||
/// Utility functions for efficient system data collection
|
||||
pub mod utils {
|
||||
use std::fs;
|
||||
use super::CollectorError;
|
||||
|
||||
use std::fs;
|
||||
|
||||
/// Read entire file content efficiently
|
||||
pub fn read_proc_file(path: &str) -> Result<String, CollectorError> {
|
||||
fs::read_to_string(path).map_err(|e| CollectorError::SystemRead {
|
||||
@@ -88,25 +74,25 @@ pub mod utils {
|
||||
error: e.to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Parse float from string slice efficiently
|
||||
pub fn parse_f32(s: &str) -> Result<f32, CollectorError> {
|
||||
s.trim().parse().map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
|
||||
value: s.to_string(),
|
||||
error: e.to_string(),
|
||||
})
|
||||
s.trim()
|
||||
.parse()
|
||||
.map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
|
||||
value: s.to_string(),
|
||||
error: e.to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Parse integer from string slice efficiently
|
||||
pub fn parse_u64(s: &str) -> Result<u64, CollectorError> {
|
||||
s.trim().parse().map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
||||
value: s.to_string(),
|
||||
error: e.to_string(),
|
||||
})
|
||||
s.trim()
|
||||
.parse()
|
||||
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
||||
value: s.to_string(),
|
||||
error: e.to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Split string and get nth element safely
|
||||
pub fn split_nth<'a>(s: &'a str, delimiter: char, n: usize) -> Option<&'a str> {
|
||||
s.split(delimiter).nth(n)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
224
agent/src/collectors/network.rs
Normal file
224
agent/src/collectors/network.rs
Normal file
@@ -0,0 +1,224 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{AgentData, NetworkInterfaceData, Status};
|
||||
use std::process::Command;
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError};
|
||||
use crate::config::NetworkConfig;
|
||||
|
||||
/// Network interface collector with physical/virtual classification and link status
|
||||
pub struct NetworkCollector {
|
||||
_config: NetworkConfig,
|
||||
}
|
||||
|
||||
impl NetworkCollector {
|
||||
pub fn new(config: NetworkConfig) -> Self {
|
||||
Self { _config: config }
|
||||
}
|
||||
|
||||
/// Check if interface is physical (not virtual)
|
||||
fn is_physical_interface(name: &str) -> bool {
|
||||
// Physical interface patterns
|
||||
matches!(
|
||||
&name[..],
|
||||
s if s.starts_with("eth")
|
||||
|| s.starts_with("ens")
|
||||
|| s.starts_with("enp")
|
||||
|| s.starts_with("wlan")
|
||||
|| s.starts_with("wlp")
|
||||
|| s.starts_with("eno")
|
||||
|| s.starts_with("enx")
|
||||
)
|
||||
}
|
||||
|
||||
/// Get link status for an interface
|
||||
fn get_link_status(interface: &str) -> Status {
|
||||
let operstate_path = format!("/sys/class/net/{}/operstate", interface);
|
||||
|
||||
match std::fs::read_to_string(&operstate_path) {
|
||||
Ok(state) => {
|
||||
let state = state.trim();
|
||||
match state {
|
||||
"up" => Status::Ok,
|
||||
"down" => Status::Inactive,
|
||||
"unknown" => Status::Warning,
|
||||
_ => Status::Unknown,
|
||||
}
|
||||
}
|
||||
Err(_) => Status::Unknown,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the primary physical interface (the one with default route)
|
||||
fn get_primary_physical_interface() -> Option<String> {
|
||||
match Command::new("ip").args(["route", "show", "default"]).output() {
|
||||
Ok(output) if output.status.success() => {
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
// Parse: "default via 192.168.1.1 dev eno1 ..."
|
||||
for line in output_str.lines() {
|
||||
if line.starts_with("default") {
|
||||
if let Some(dev_pos) = line.find(" dev ") {
|
||||
let after_dev = &line[dev_pos + 5..];
|
||||
if let Some(space_pos) = after_dev.find(' ') {
|
||||
let interface = &after_dev[..space_pos];
|
||||
// Only return if it's a physical interface
|
||||
if Self::is_physical_interface(interface) {
|
||||
return Some(interface.to_string());
|
||||
}
|
||||
} else {
|
||||
// No space after interface name (end of line)
|
||||
let interface = after_dev.trim();
|
||||
if Self::is_physical_interface(interface) {
|
||||
return Some(interface.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse VLAN configuration from /proc/net/vlan/config
|
||||
/// Returns a map of interface name -> VLAN ID
|
||||
fn parse_vlan_config() -> std::collections::HashMap<String, u16> {
|
||||
let mut vlan_map = std::collections::HashMap::new();
|
||||
|
||||
if let Ok(contents) = std::fs::read_to_string("/proc/net/vlan/config") {
|
||||
for line in contents.lines().skip(2) { // Skip header lines
|
||||
let parts: Vec<&str> = line.split('|').collect();
|
||||
if parts.len() >= 2 {
|
||||
let interface_name = parts[0].trim();
|
||||
let vlan_id_str = parts[1].trim();
|
||||
|
||||
if let Ok(vlan_id) = vlan_id_str.parse::<u16>() {
|
||||
vlan_map.insert(interface_name.to_string(), vlan_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vlan_map
|
||||
}
|
||||
|
||||
/// Collect network interfaces using ip command
|
||||
async fn collect_interfaces(&self) -> Vec<NetworkInterfaceData> {
|
||||
let mut interfaces = Vec::new();
|
||||
|
||||
// Parse VLAN configuration
|
||||
let vlan_map = Self::parse_vlan_config();
|
||||
|
||||
match Command::new("ip").args(["-j", "addr"]).output() {
|
||||
Ok(output) if output.status.success() => {
|
||||
let json_str = String::from_utf8_lossy(&output.stdout);
|
||||
|
||||
if let Ok(json_data) = serde_json::from_str::<serde_json::Value>(&json_str) {
|
||||
if let Some(ifaces) = json_data.as_array() {
|
||||
for iface in ifaces {
|
||||
let name = iface["ifname"].as_str().unwrap_or("").to_string();
|
||||
|
||||
// Skip loopback, empty names, and ifb* interfaces
|
||||
if name.is_empty() || name == "lo" || name.starts_with("ifb") {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Parse parent interface from @parent notation (e.g., lan@enp0s31f6)
|
||||
let (interface_name, parent_interface) = if let Some(at_pos) = name.find('@') {
|
||||
let (child, parent) = name.split_at(at_pos);
|
||||
(child.to_string(), Some(parent[1..].to_string()))
|
||||
} else {
|
||||
(name.clone(), None)
|
||||
};
|
||||
|
||||
let mut ipv4_addresses = Vec::new();
|
||||
let mut ipv6_addresses = Vec::new();
|
||||
|
||||
// Extract IP addresses
|
||||
if let Some(addr_info) = iface["addr_info"].as_array() {
|
||||
for addr in addr_info {
|
||||
if let Some(family) = addr["family"].as_str() {
|
||||
if let Some(local) = addr["local"].as_str() {
|
||||
match family {
|
||||
"inet" => ipv4_addresses.push(local.to_string()),
|
||||
"inet6" => {
|
||||
// Skip link-local IPv6 addresses (fe80::)
|
||||
if !local.starts_with("fe80:") {
|
||||
ipv6_addresses.push(local.to_string());
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Determine if physical and get status
|
||||
let is_physical = Self::is_physical_interface(&interface_name);
|
||||
|
||||
// Only filter out virtual interfaces without IPs
|
||||
// Physical interfaces should always be shown even if down/no IPs
|
||||
if !is_physical && ipv4_addresses.is_empty() && ipv6_addresses.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let link_status = if is_physical {
|
||||
Self::get_link_status(&name)
|
||||
} else {
|
||||
Status::Unknown // Virtual interfaces don't have meaningful link status
|
||||
};
|
||||
|
||||
// Look up VLAN ID from the map (use original name before @ parsing)
|
||||
let vlan_id = vlan_map.get(&name).copied();
|
||||
|
||||
interfaces.push(NetworkInterfaceData {
|
||||
name: interface_name,
|
||||
ipv4_addresses,
|
||||
ipv6_addresses,
|
||||
is_physical,
|
||||
link_status,
|
||||
parent_interface,
|
||||
vlan_id,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to execute ip command: {}", e);
|
||||
}
|
||||
Ok(output) => {
|
||||
debug!("ip command failed with status: {}", output.status);
|
||||
}
|
||||
}
|
||||
|
||||
// Assign primary physical interface as parent to virtual interfaces without explicit parent
|
||||
let primary_interface = Self::get_primary_physical_interface();
|
||||
if let Some(primary) = primary_interface {
|
||||
for interface in interfaces.iter_mut() {
|
||||
// Only assign parent to virtual interfaces that don't already have one
|
||||
if !interface.is_physical && interface.parent_interface.is_none() {
|
||||
interface.parent_interface = Some(primary.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
interfaces
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for NetworkCollector {
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
debug!("Collecting network interface data");
|
||||
|
||||
// Collect all network interfaces
|
||||
let interfaces = self.collect_interfaces().await;
|
||||
|
||||
agent_data.system.network.interfaces = interfaces;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
111
agent/src/collectors/nixos.rs
Normal file
111
agent/src/collectors/nixos.rs
Normal file
@@ -0,0 +1,111 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::AgentData;
|
||||
use std::fs;
|
||||
use std::process::Command;
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError};
|
||||
|
||||
/// NixOS system information collector with structured data output
|
||||
///
|
||||
/// This collector gathers NixOS-specific information like:
|
||||
/// - System generation/build information
|
||||
/// - Version information
|
||||
/// - Agent version from Nix store path
|
||||
pub struct NixOSCollector;
|
||||
|
||||
impl NixOSCollector {
|
||||
pub fn new(_config: crate::config::NixOSConfig) -> Self {
|
||||
Self
|
||||
}
|
||||
|
||||
/// Collect NixOS system information and populate AgentData
|
||||
async fn collect_nixos_info(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
debug!("Collecting NixOS system information");
|
||||
|
||||
// Set hostname (this is universal, not NixOS-specific)
|
||||
agent_data.hostname = self.get_hostname().await.unwrap_or_else(|| "unknown".to_string());
|
||||
|
||||
// Set agent version from environment or Nix store path
|
||||
agent_data.agent_version = self.get_agent_version().await;
|
||||
|
||||
// Set NixOS build/generation information
|
||||
agent_data.build_version = self.get_nixos_generation().await;
|
||||
|
||||
// Set current timestamp
|
||||
agent_data.timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get system hostname
|
||||
async fn get_hostname(&self) -> Option<String> {
|
||||
match fs::read_to_string("/etc/hostname") {
|
||||
Ok(hostname) => Some(hostname.trim().to_string()),
|
||||
Err(_) => {
|
||||
// Fallback to hostname command
|
||||
match Command::new("hostname").output() {
|
||||
Ok(output) => Some(String::from_utf8_lossy(&output.stdout).trim().to_string()),
|
||||
Err(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get agent version from Nix store path or environment
|
||||
async fn get_agent_version(&self) -> String {
|
||||
// Try to extract version from the current executable path (Nix store)
|
||||
if let Ok(current_exe) = std::env::current_exe() {
|
||||
if let Some(exe_path) = current_exe.to_str() {
|
||||
if exe_path.starts_with("/nix/store/") {
|
||||
// Extract version from Nix store path
|
||||
// Path format: /nix/store/hash-cm-dashboard-agent-v0.1.138/bin/cm-dashboard-agent
|
||||
if let Some(store_part) = exe_path.strip_prefix("/nix/store/") {
|
||||
if let Some(dash_pos) = store_part.find('-') {
|
||||
let package_part = &store_part[dash_pos + 1..];
|
||||
if let Some(bin_pos) = package_part.find("/bin/") {
|
||||
let package_name = &package_part[..bin_pos];
|
||||
// Extract version from package name
|
||||
if let Some(version_start) = package_name.rfind("-v") {
|
||||
return package_name[version_start + 1..].to_string();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to environment variable or default
|
||||
std::env::var("CM_DASHBOARD_VERSION").unwrap_or_else(|_| "unknown".to_string())
|
||||
}
|
||||
|
||||
/// Get NixOS system generation (build) information from git commit
|
||||
async fn get_nixos_generation(&self) -> Option<String> {
|
||||
// Try to read git commit hash from file written during rebuild
|
||||
let commit_file = "/var/lib/cm-dashboard/git-commit";
|
||||
match fs::read_to_string(commit_file) {
|
||||
Ok(content) => {
|
||||
let commit_hash = content.trim();
|
||||
if commit_hash.len() >= 7 {
|
||||
debug!("Found git commit hash: {}", commit_hash);
|
||||
Some(commit_hash.to_string())
|
||||
} else {
|
||||
debug!("Git commit hash too short: {}", commit_hash);
|
||||
None
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to read git commit file {}: {}", commit_file, e);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for NixOSCollector {
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
self.collect_nixos_info(agent_data).await
|
||||
}
|
||||
}
|
||||
@@ -1,90 +1,227 @@
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status};
|
||||
use cm_dashboard_shared::{AgentData, ServiceData, SubServiceData, SubServiceMetric, Status};
|
||||
use std::process::Command;
|
||||
use std::sync::RwLock;
|
||||
use std::time::Instant;
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError, PerformanceMetrics};
|
||||
use super::{Collector, CollectorError};
|
||||
use crate::config::SystemdConfig;
|
||||
|
||||
/// Systemd collector for monitoring systemd services
|
||||
/// Systemd collector for monitoring systemd services with structured data output
|
||||
pub struct SystemdCollector {
|
||||
/// Performance tracking
|
||||
last_collection_time: Option<std::time::Duration>,
|
||||
/// Cached state with thread-safe interior mutability
|
||||
state: RwLock<ServiceCacheState>,
|
||||
/// Configuration for service monitoring
|
||||
config: SystemdConfig,
|
||||
}
|
||||
|
||||
/// Internal state for service caching
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Clone)]
|
||||
struct ServiceCacheState {
|
||||
/// Last collection time for performance tracking
|
||||
last_collection: Option<Instant>,
|
||||
/// Cached complete service data with sub-services
|
||||
cached_service_data: Vec<ServiceData>,
|
||||
/// Interesting services to monitor (cached after discovery)
|
||||
monitored_services: Vec<String>,
|
||||
/// Cached service status information from discovery
|
||||
service_status_cache: std::collections::HashMap<String, ServiceStatusInfo>,
|
||||
/// Last time services were discovered
|
||||
last_discovery_time: Option<Instant>,
|
||||
/// How often to rediscover services (5 minutes)
|
||||
/// How often to rediscover services (from config)
|
||||
discovery_interval_seconds: u64,
|
||||
/// Cached nginx site latency metrics
|
||||
nginx_site_metrics: Vec<Metric>,
|
||||
nginx_site_metrics: Vec<(String, f32)>,
|
||||
/// Last time nginx sites were checked
|
||||
last_nginx_check_time: Option<Instant>,
|
||||
/// How often to check nginx site latency (30 seconds)
|
||||
/// How often to check nginx site latency (configurable)
|
||||
nginx_check_interval_seconds: u64,
|
||||
}
|
||||
|
||||
/// Cached service status information from systemctl list-units
|
||||
#[derive(Debug, Clone)]
|
||||
struct ServiceStatusInfo {
|
||||
load_state: String,
|
||||
active_state: String,
|
||||
sub_state: String,
|
||||
}
|
||||
|
||||
impl SystemdCollector {
|
||||
pub fn new() -> Self {
|
||||
pub fn new(config: SystemdConfig) -> Self {
|
||||
let state = ServiceCacheState {
|
||||
last_collection: None,
|
||||
cached_service_data: Vec::new(),
|
||||
monitored_services: Vec::new(),
|
||||
service_status_cache: std::collections::HashMap::new(),
|
||||
last_discovery_time: None,
|
||||
discovery_interval_seconds: config.interval_seconds,
|
||||
nginx_site_metrics: Vec::new(),
|
||||
last_nginx_check_time: None,
|
||||
nginx_check_interval_seconds: config.nginx_check_interval_seconds,
|
||||
};
|
||||
|
||||
Self {
|
||||
last_collection_time: None,
|
||||
state: RwLock::new(ServiceCacheState {
|
||||
monitored_services: Vec::new(),
|
||||
last_discovery_time: None,
|
||||
discovery_interval_seconds: 300, // 5 minutes
|
||||
nginx_site_metrics: Vec::new(),
|
||||
last_nginx_check_time: None,
|
||||
nginx_check_interval_seconds: 30, // 30 seconds for nginx sites
|
||||
}),
|
||||
state: RwLock::new(state),
|
||||
config,
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect service data and populate AgentData
|
||||
async fn collect_service_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
let start_time = Instant::now();
|
||||
debug!("Collecting systemd services metrics");
|
||||
|
||||
// Get cached services (discovery only happens when needed)
|
||||
let monitored_services = match self.get_monitored_services() {
|
||||
Ok(services) => services,
|
||||
Err(e) => {
|
||||
debug!("Failed to get monitored services: {}", e);
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
// Collect service data for each monitored service
|
||||
let mut complete_service_data = Vec::new();
|
||||
for service_name in &monitored_services {
|
||||
match self.get_service_status(service_name) {
|
||||
Ok((active_status, _detailed_info)) => {
|
||||
let memory_mb = self.get_service_memory_usage(service_name).await.unwrap_or(0.0);
|
||||
let disk_gb = self.get_service_disk_usage(service_name).await.unwrap_or(0.0);
|
||||
|
||||
let mut sub_services = Vec::new();
|
||||
|
||||
// Sub-service metrics for specific services (always include cached results)
|
||||
if service_name.contains("nginx") && active_status == "active" {
|
||||
let nginx_sites = self.get_nginx_site_metrics();
|
||||
for (site_name, latency_ms) in nginx_sites {
|
||||
let site_status = if latency_ms >= 0.0 && latency_ms < self.config.nginx_latency_critical_ms {
|
||||
"active"
|
||||
} else {
|
||||
"failed"
|
||||
};
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
metrics.push(SubServiceMetric {
|
||||
label: "latency_ms".to_string(),
|
||||
value: latency_ms,
|
||||
unit: Some("ms".to_string()),
|
||||
});
|
||||
|
||||
sub_services.push(SubServiceData {
|
||||
name: site_name.clone(),
|
||||
service_status: self.calculate_service_status(&site_name, &site_status),
|
||||
metrics,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if service_name.contains("docker") && active_status == "active" {
|
||||
let docker_containers = self.get_docker_containers();
|
||||
for (container_name, container_status) in docker_containers {
|
||||
// For now, docker containers have no additional metrics
|
||||
// Future: could add memory_mb, cpu_percent, restart_count, etc.
|
||||
let metrics = Vec::new();
|
||||
|
||||
sub_services.push(SubServiceData {
|
||||
name: container_name.clone(),
|
||||
service_status: self.calculate_service_status(&container_name, &container_status),
|
||||
metrics,
|
||||
});
|
||||
}
|
||||
|
||||
// Add Docker images
|
||||
let docker_images = self.get_docker_images();
|
||||
for (image_name, image_status, image_size) in docker_images {
|
||||
let mut metrics = Vec::new();
|
||||
metrics.push(SubServiceMetric {
|
||||
label: "size".to_string(),
|
||||
value: 0.0, // Size as string in name instead
|
||||
unit: None,
|
||||
});
|
||||
|
||||
sub_services.push(SubServiceData {
|
||||
name: format!("{} ({})", image_name, image_size),
|
||||
service_status: self.calculate_service_status(&image_name, &image_status),
|
||||
metrics,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Create complete service data
|
||||
let service_data = ServiceData {
|
||||
name: service_name.clone(),
|
||||
memory_mb,
|
||||
disk_gb,
|
||||
user_stopped: false, // TODO: Integrate with service tracker
|
||||
service_status: self.calculate_service_status(service_name, &active_status),
|
||||
sub_services,
|
||||
};
|
||||
|
||||
// Add to AgentData and cache
|
||||
agent_data.services.push(service_data.clone());
|
||||
complete_service_data.push(service_data);
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to get status for service {}: {}", service_name, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update cached state
|
||||
{
|
||||
let mut state = self.state.write().unwrap();
|
||||
state.last_collection = Some(start_time);
|
||||
state.cached_service_data = complete_service_data;
|
||||
}
|
||||
|
||||
let elapsed = start_time.elapsed();
|
||||
debug!("Systemd collection completed in {:?} with {} services", elapsed, agent_data.services.len());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get monitored services, discovering them if needed or cache is expired
|
||||
fn get_monitored_services(&self) -> Result<Vec<String>> {
|
||||
let mut state = self.state.write().unwrap();
|
||||
|
||||
// Check if we need to discover services
|
||||
let needs_discovery = match state.last_discovery_time {
|
||||
None => true, // First time
|
||||
Some(last_time) => {
|
||||
let elapsed = last_time.elapsed().as_secs();
|
||||
elapsed >= state.discovery_interval_seconds
|
||||
// Check if we need discovery without holding the lock
|
||||
let needs_discovery = {
|
||||
let state = self.state.read().unwrap();
|
||||
match state.last_discovery_time {
|
||||
None => true, // First time
|
||||
Some(last_time) => {
|
||||
let elapsed = last_time.elapsed().as_secs();
|
||||
elapsed >= state.discovery_interval_seconds
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if needs_discovery {
|
||||
debug!("Discovering systemd services (cache expired or first run)");
|
||||
match self.discover_services() {
|
||||
Ok(services) => {
|
||||
state.monitored_services = services;
|
||||
state.last_discovery_time = Some(Instant::now());
|
||||
debug!(
|
||||
"Auto-discovered {} services to monitor: {:?}",
|
||||
state.monitored_services.len(),
|
||||
state.monitored_services
|
||||
);
|
||||
match self.discover_services_internal() {
|
||||
Ok((services, status_cache)) => {
|
||||
if let Ok(mut state) = self.state.write() {
|
||||
state.monitored_services = services.clone();
|
||||
state.service_status_cache = status_cache;
|
||||
state.last_discovery_time = Some(Instant::now());
|
||||
debug!("Auto-discovered {} services to monitor: {:?}",
|
||||
state.monitored_services.len(), state.monitored_services);
|
||||
return Ok(services);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to discover services, using cached list: {}", e);
|
||||
// Continue with existing cached services if discovery fails
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return cached services
|
||||
let state = self.state.read().unwrap();
|
||||
Ok(state.monitored_services.clone())
|
||||
}
|
||||
|
||||
/// Get nginx site metrics, checking them if cache is expired
|
||||
fn get_nginx_site_metrics(&self) -> Vec<Metric> {
|
||||
/// Get nginx site metrics, checking them if cache is expired (like old working version)
|
||||
fn get_nginx_site_metrics(&self) -> Vec<(String, f32)> {
|
||||
let mut state = self.state.write().unwrap();
|
||||
|
||||
// Check if we need to refresh nginx site metrics
|
||||
@@ -99,11 +236,7 @@ impl SystemdCollector {
|
||||
if needs_refresh {
|
||||
// Only check nginx sites if nginx service is active
|
||||
if state.monitored_services.iter().any(|s| s.contains("nginx")) {
|
||||
debug!(
|
||||
"Refreshing nginx site latency metrics (interval: {}s)",
|
||||
state.nginx_check_interval_seconds
|
||||
);
|
||||
let fresh_metrics = self.get_nginx_sites();
|
||||
let fresh_metrics = self.get_nginx_sites_internal();
|
||||
state.nginx_site_metrics = fresh_metrics;
|
||||
state.last_nginx_check_time = Some(Instant::now());
|
||||
}
|
||||
@@ -113,196 +246,219 @@ impl SystemdCollector {
|
||||
}
|
||||
|
||||
/// Auto-discover interesting services to monitor
|
||||
fn discover_services(&self) -> Result<Vec<String>> {
|
||||
let output = Command::new("systemctl")
|
||||
.arg("list-units")
|
||||
.arg("--type=service")
|
||||
.arg("--state=running,failed,inactive")
|
||||
.arg("--no-pager")
|
||||
.arg("--plain")
|
||||
fn discover_services_internal(&self) -> Result<(Vec<String>, std::collections::HashMap<String, ServiceStatusInfo>)> {
|
||||
// First: Get all service unit files
|
||||
let unit_files_output = Command::new("systemctl")
|
||||
.args(&["list-unit-files", "--type=service", "--no-pager", "--plain"])
|
||||
.output()?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(anyhow::anyhow!("systemctl command failed"));
|
||||
if !unit_files_output.status.success() {
|
||||
return Err(anyhow::anyhow!("systemctl list-unit-files command failed"));
|
||||
}
|
||||
|
||||
let output_str = String::from_utf8(output.stdout)?;
|
||||
// Second: Get runtime status of all units
|
||||
let units_status_output = Command::new("systemctl")
|
||||
.args(&["list-units", "--type=service", "--all", "--no-pager", "--plain"])
|
||||
.output()?;
|
||||
|
||||
if !units_status_output.status.success() {
|
||||
return Err(anyhow::anyhow!("systemctl list-units command failed"));
|
||||
}
|
||||
|
||||
let unit_files_str = String::from_utf8(unit_files_output.stdout)?;
|
||||
let units_status_str = String::from_utf8(units_status_output.stdout)?;
|
||||
let mut services = Vec::new();
|
||||
|
||||
// Skip setup/certificate services that don't need monitoring (from legacy)
|
||||
let excluded_services = [
|
||||
"mosquitto-certs",
|
||||
"immich-setup",
|
||||
"phpfpm-kryddorten",
|
||||
"phpfpm-mariehall2",
|
||||
"acme-haasp.net",
|
||||
"acme-selfsigned-haasp",
|
||||
"borgbackup",
|
||||
"haasp-site-deploy",
|
||||
"mosquitto-backup",
|
||||
"nginx-config-reload",
|
||||
"sshd-keygen",
|
||||
];
|
||||
let excluded_services = &self.config.excluded_services;
|
||||
let service_name_filters = &self.config.service_name_filters;
|
||||
|
||||
// Define patterns for services we want to monitor (from legacy)
|
||||
let interesting_services = [
|
||||
// Web applications
|
||||
"gitea",
|
||||
"immich",
|
||||
"vaultwarden",
|
||||
"unifi",
|
||||
"wordpress",
|
||||
"nginx",
|
||||
"httpd",
|
||||
// Databases
|
||||
"postgresql",
|
||||
"mysql",
|
||||
"mariadb",
|
||||
"redis",
|
||||
"mongodb",
|
||||
"mongod",
|
||||
// Backup and storage
|
||||
"borg",
|
||||
"rclone",
|
||||
// Container runtimes
|
||||
"docker",
|
||||
// CI/CD services
|
||||
"gitea-actions",
|
||||
"gitea-runner",
|
||||
"actions-runner",
|
||||
// Network services
|
||||
"sshd",
|
||||
"dnsmasq",
|
||||
// MQTT and IoT services
|
||||
"mosquitto",
|
||||
"mqtt",
|
||||
// PHP-FPM services
|
||||
"phpfpm",
|
||||
// Home automation
|
||||
"haasp",
|
||||
// Backup services
|
||||
"backup",
|
||||
// Game servers
|
||||
"ark",
|
||||
];
|
||||
|
||||
for line in output_str.lines() {
|
||||
// Parse all service unit files
|
||||
let mut all_service_names = std::collections::HashSet::new();
|
||||
for line in unit_files_str.lines() {
|
||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||
if fields.len() >= 4 && fields[0].ends_with(".service") {
|
||||
if fields.len() >= 2 && fields[0].ends_with(".service") {
|
||||
let service_name = fields[0].trim_end_matches(".service");
|
||||
debug!("Processing service: '{}'", service_name);
|
||||
|
||||
// Skip excluded services first
|
||||
let mut is_excluded = false;
|
||||
for excluded in &excluded_services {
|
||||
if service_name.contains(excluded) {
|
||||
debug!(
|
||||
"EXCLUDING service '{}' because it matches pattern '{}'",
|
||||
service_name, excluded
|
||||
);
|
||||
is_excluded = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if is_excluded {
|
||||
debug!("Skipping excluded service: '{}'", service_name);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if this service matches our interesting patterns
|
||||
for pattern in &interesting_services {
|
||||
if service_name.contains(pattern) || pattern.contains(service_name) {
|
||||
debug!(
|
||||
"INCLUDING service '{}' because it matches pattern '{}'",
|
||||
service_name, pattern
|
||||
);
|
||||
services.push(service_name.to_string());
|
||||
break;
|
||||
}
|
||||
}
|
||||
all_service_names.insert(service_name.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// Always include ssh/sshd if present
|
||||
if !services.iter().any(|s| s.contains("ssh")) {
|
||||
for line in output_str.lines() {
|
||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||
if fields.len() >= 4 && (fields[0] == "sshd.service" || fields[0] == "ssh.service")
|
||||
{
|
||||
let service_name = fields[0].trim_end_matches(".service");
|
||||
// Parse runtime status for all units
|
||||
let mut status_cache = std::collections::HashMap::new();
|
||||
for line in units_status_str.lines() {
|
||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||
if fields.len() >= 4 && fields[0].ends_with(".service") {
|
||||
let service_name = fields[0].trim_end_matches(".service");
|
||||
let load_state = fields.get(1).unwrap_or(&"unknown").to_string();
|
||||
let active_state = fields.get(2).unwrap_or(&"unknown").to_string();
|
||||
let sub_state = fields.get(3).unwrap_or(&"unknown").to_string();
|
||||
|
||||
status_cache.insert(service_name.to_string(), ServiceStatusInfo {
|
||||
load_state,
|
||||
active_state,
|
||||
sub_state,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// For services found in unit files but not in runtime status, set default inactive status
|
||||
for service_name in &all_service_names {
|
||||
if !status_cache.contains_key(service_name) {
|
||||
status_cache.insert(service_name.to_string(), ServiceStatusInfo {
|
||||
load_state: "not-loaded".to_string(),
|
||||
active_state: "inactive".to_string(),
|
||||
sub_state: "dead".to_string(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Process all discovered services and apply filters
|
||||
for service_name in &all_service_names {
|
||||
// Skip excluded services first
|
||||
let mut is_excluded = false;
|
||||
for excluded in excluded_services {
|
||||
if service_name.contains(excluded) {
|
||||
is_excluded = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if is_excluded {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if this service matches our filter patterns (supports wildcards)
|
||||
for pattern in service_name_filters {
|
||||
if self.matches_pattern(service_name, pattern) {
|
||||
services.push(service_name.to_string());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(services)
|
||||
Ok((services, status_cache))
|
||||
}
|
||||
|
||||
/// Get service status using systemctl
|
||||
/// Get service status from cache (if available) or fallback to systemctl
|
||||
fn get_service_status(&self, service: &str) -> Result<(String, String)> {
|
||||
// Try to get status from cache first
|
||||
if let Ok(state) = self.state.read() {
|
||||
if let Some(cached_info) = state.service_status_cache.get(service) {
|
||||
let active_status = cached_info.active_state.clone();
|
||||
let detailed_info = format!(
|
||||
"LoadState={}\nActiveState={}\nSubState={}",
|
||||
cached_info.load_state,
|
||||
cached_info.active_state,
|
||||
cached_info.sub_state
|
||||
);
|
||||
return Ok((active_status, detailed_info));
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to systemctl if not in cache
|
||||
let output = Command::new("systemctl")
|
||||
.arg("is-active")
|
||||
.arg(format!("{}.service", service))
|
||||
.args(&["is-active", &format!("{}.service", service)])
|
||||
.output()?;
|
||||
|
||||
let active_status = String::from_utf8(output.stdout)?.trim().to_string();
|
||||
|
||||
// Get more detailed info
|
||||
let output = Command::new("systemctl")
|
||||
.arg("show")
|
||||
.arg(format!("{}.service", service))
|
||||
.arg("--property=LoadState,ActiveState,SubState")
|
||||
.args(&["show", &format!("{}.service", service), "--property=LoadState,ActiveState,SubState"])
|
||||
.output()?;
|
||||
|
||||
let detailed_info = String::from_utf8(output.stdout)?;
|
||||
Ok((active_status, detailed_info))
|
||||
}
|
||||
|
||||
/// Calculate service status
|
||||
fn calculate_service_status(&self, active_status: &str) -> Status {
|
||||
match active_status.to_lowercase().as_str() {
|
||||
"active" => Status::Ok,
|
||||
"inactive" | "dead" => Status::Warning,
|
||||
"failed" | "error" => Status::Critical,
|
||||
_ => Status::Unknown,
|
||||
/// Check if service name matches pattern (supports wildcards like nginx*)
|
||||
fn matches_pattern(&self, service_name: &str, pattern: &str) -> bool {
|
||||
if pattern.contains('*') {
|
||||
if pattern.ends_with('*') {
|
||||
// Pattern like "nginx*" - match if service starts with "nginx"
|
||||
let prefix = &pattern[..pattern.len() - 1];
|
||||
service_name.starts_with(prefix)
|
||||
} else if pattern.starts_with('*') {
|
||||
// Pattern like "*backup" - match if service ends with "backup"
|
||||
let suffix = &pattern[1..];
|
||||
service_name.ends_with(suffix)
|
||||
} else {
|
||||
// Pattern like "nginx*backup" - simple glob matching
|
||||
self.simple_glob_match(service_name, pattern)
|
||||
}
|
||||
} else {
|
||||
// Exact match
|
||||
service_name == pattern
|
||||
}
|
||||
}
|
||||
|
||||
/// Get service memory usage (if available)
|
||||
fn get_service_memory(&self, service: &str) -> Option<f32> {
|
||||
let output = Command::new("systemctl")
|
||||
.arg("show")
|
||||
.arg(format!("{}.service", service))
|
||||
.arg("--property=MemoryCurrent")
|
||||
.output()
|
||||
.ok()?;
|
||||
/// Simple glob matching for patterns with * in the middle
|
||||
fn simple_glob_match(&self, text: &str, pattern: &str) -> bool {
|
||||
let parts: Vec<&str> = pattern.split('*').collect();
|
||||
let mut pos = 0;
|
||||
|
||||
for part in parts {
|
||||
if part.is_empty() {
|
||||
continue;
|
||||
}
|
||||
if let Some(found_pos) = text[pos..].find(part) {
|
||||
pos += found_pos + part.len();
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
let output_str = String::from_utf8(output.stdout).ok()?;
|
||||
/// Get disk usage for a specific service
|
||||
async fn get_service_disk_usage(&self, service_name: &str) -> Result<f32, CollectorError> {
|
||||
// Check if this service has configured directory paths
|
||||
if let Some(dirs) = self.config.service_directories.get(service_name) {
|
||||
// Service has configured paths - use the first accessible one
|
||||
for dir in dirs {
|
||||
if let Some(size) = self.get_directory_size(dir) {
|
||||
return Ok(size);
|
||||
}
|
||||
}
|
||||
// If configured paths failed, return 0
|
||||
return Ok(0.0);
|
||||
}
|
||||
|
||||
// No configured path - try to get WorkingDirectory from systemctl
|
||||
let output = Command::new("systemctl")
|
||||
.args(&["show", &format!("{}.service", service_name), "--property=WorkingDirectory"])
|
||||
.output()
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: format!("WorkingDirectory for {}", service_name),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
for line in output_str.lines() {
|
||||
if line.starts_with("MemoryCurrent=") {
|
||||
let memory_str = line.trim_start_matches("MemoryCurrent=");
|
||||
if let Ok(memory_bytes) = memory_str.parse::<u64>() {
|
||||
return Some(memory_bytes as f32 / (1024.0 * 1024.0)); // Convert to MB
|
||||
if line.starts_with("WorkingDirectory=") && !line.contains("[not set]") {
|
||||
let dir = line.strip_prefix("WorkingDirectory=").unwrap_or("");
|
||||
if !dir.is_empty() && dir != "/" {
|
||||
return Ok(self.get_directory_size(dir).unwrap_or(0.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
|
||||
Ok(0.0)
|
||||
}
|
||||
|
||||
|
||||
/// Get directory size in GB with permission-aware logging
|
||||
fn get_directory_size(&self, dir: &str) -> Option<f32> {
|
||||
let output = Command::new("sudo").arg("du").arg("-sb").arg(dir).output().ok()?;
|
||||
|
||||
/// Get size of a directory in GB
|
||||
fn get_directory_size(&self, path: &str) -> Option<f32> {
|
||||
let output = Command::new("sudo")
|
||||
.args(&["du", "-sb", path])
|
||||
.output()
|
||||
.ok()?;
|
||||
|
||||
if !output.status.success() {
|
||||
// Log permission errors for debugging but don't spam logs
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
if stderr.contains("Permission denied") {
|
||||
debug!("Permission denied accessing directory: {}", dir);
|
||||
debug!("Permission denied accessing directory: {}", path);
|
||||
} else {
|
||||
debug!("Failed to get size for directory {}: {}", dir, stderr);
|
||||
debug!("Failed to get size for directory {}: {}", path, stderr);
|
||||
}
|
||||
return None;
|
||||
}
|
||||
@@ -322,305 +478,98 @@ impl SystemdCollector {
|
||||
}
|
||||
}
|
||||
|
||||
/// Get service disk usage - simple and deterministic
|
||||
fn get_service_disk_usage(&self, service: &str) -> Option<f32> {
|
||||
// 1. Check if service has defined directories
|
||||
let defined_dirs = self.get_service_directories(service);
|
||||
if !defined_dirs.is_empty() {
|
||||
// Service has defined paths - use ONLY those
|
||||
for dir in defined_dirs {
|
||||
if let Some(size) = self.get_directory_size(dir) {
|
||||
return Some(size);
|
||||
}
|
||||
}
|
||||
// If defined path failed, return None (shows as "-")
|
||||
return None;
|
||||
/// Calculate service status, taking user-stopped services into account
|
||||
fn calculate_service_status(&self, service_name: &str, active_status: &str) -> Status {
|
||||
match active_status.to_lowercase().as_str() {
|
||||
"active" => Status::Ok,
|
||||
"inactive" | "dead" => {
|
||||
debug!("Service '{}' is inactive - treating as Inactive status", service_name);
|
||||
Status::Inactive
|
||||
},
|
||||
"failed" | "error" => Status::Critical,
|
||||
"activating" | "deactivating" | "reloading" | "starting" | "stopping" => {
|
||||
debug!("Service '{}' is transitioning - treating as Pending", service_name);
|
||||
Status::Pending
|
||||
},
|
||||
_ => Status::Unknown,
|
||||
}
|
||||
}
|
||||
|
||||
// 2. No defined path - use systemctl WorkingDirectory
|
||||
/// Get memory usage for a specific service
|
||||
async fn get_service_memory_usage(&self, service_name: &str) -> Result<f32, CollectorError> {
|
||||
let output = Command::new("systemctl")
|
||||
.arg("show")
|
||||
.arg(format!("{}.service", service))
|
||||
.arg("--property=WorkingDirectory")
|
||||
.args(&["show", &format!("{}.service", service_name), "--property=MemoryCurrent"])
|
||||
.output()
|
||||
.ok()?;
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: format!("memory usage for {}", service_name),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
|
||||
let output_str = String::from_utf8(output.stdout).ok()?;
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
|
||||
for line in output_str.lines() {
|
||||
if line.starts_with("WorkingDirectory=") && !line.contains("[not set]") {
|
||||
let dir = line.trim_start_matches("WorkingDirectory=");
|
||||
if !dir.is_empty() && dir != "/" {
|
||||
return self.get_directory_size(dir);
|
||||
if line.starts_with("MemoryCurrent=") {
|
||||
if let Some(mem_str) = line.strip_prefix("MemoryCurrent=") {
|
||||
if mem_str != "[not set]" {
|
||||
if let Ok(memory_bytes) = mem_str.parse::<u64>() {
|
||||
return Ok(memory_bytes as f32 / (1024.0 * 1024.0)); // Convert to MB
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
Ok(0.0)
|
||||
}
|
||||
|
||||
/// Get defined service directories (highest priority)
|
||||
fn get_service_directories(&self, service: &str) -> Vec<&str> {
|
||||
match service {
|
||||
// Game servers (ARK Survival Ascended) - HIGHEST PRIORITY
|
||||
"ark-island" => vec!["/var/lib/ark-servers/island"],
|
||||
"ark-scorched" => vec!["/var/lib/ark-servers/scorched"],
|
||||
"ark-center" => vec!["/var/lib/ark-servers/center"],
|
||||
"ark-aberration" => vec!["/var/lib/ark-servers/aberration"],
|
||||
"ark-extinction" => vec!["/var/lib/ark-servers/extinction"],
|
||||
"ark-ragnarok" => vec!["/var/lib/ark-servers/ragnarok"],
|
||||
"ark-valguero" => vec!["/var/lib/ark-servers/valguero"],
|
||||
|
||||
// Other services with defined paths
|
||||
s if s.contains("docker") => vec!["/var/lib/docker", "/var/lib/docker/containers"],
|
||||
s if s.contains("gitea") => vec!["/var/lib/gitea", "/opt/gitea", "/home/git", "/data/gitea"],
|
||||
s if s.contains("nginx") => vec!["/var/log/nginx", "/var/www", "/usr/share/nginx"],
|
||||
s if s.contains("immich") => vec!["/var/lib/immich", "/opt/immich", "/usr/src/app/upload"],
|
||||
s if s.contains("postgres") => vec!["/var/lib/postgresql", "/var/lib/postgres"],
|
||||
s if s.contains("mysql") => vec!["/var/lib/mysql"],
|
||||
s if s.contains("redis") => vec!["/var/lib/redis", "/var/redis"],
|
||||
s if s.contains("unifi") => vec!["/var/lib/unifi", "/opt/UniFi"],
|
||||
s if s.contains("vaultwarden") => vec!["/var/lib/vaultwarden", "/opt/vaultwarden"],
|
||||
s if s.contains("mosquitto") => vec!["/var/lib/mosquitto", "/etc/mosquitto"],
|
||||
s if s.contains("postfix") => vec!["/var/spool/postfix", "/var/lib/postfix"],
|
||||
|
||||
// No defined path - will fall back to systemctl WorkingDirectory
|
||||
_ => vec![],
|
||||
/// Check if service collection cache should be updated
|
||||
fn should_update_cache(&self) -> bool {
|
||||
let state = self.state.read().unwrap();
|
||||
|
||||
match state.last_collection {
|
||||
None => true,
|
||||
Some(last) => {
|
||||
let cache_duration = std::time::Duration::from_secs(30);
|
||||
last.elapsed() > cache_duration
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get cached complete service data with sub-services if available and fresh
|
||||
fn get_cached_complete_services(&self) -> Option<Vec<ServiceData>> {
|
||||
if !self.should_update_cache() {
|
||||
let state = self.state.read().unwrap();
|
||||
Some(state.cached_service_data.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for SystemdCollector {
|
||||
fn name(&self) -> &str {
|
||||
"systemd"
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
let start_time = Instant::now();
|
||||
debug!("Collecting systemd services metrics");
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
|
||||
// Get cached services (discovery only happens when needed)
|
||||
let monitored_services = match self.get_monitored_services() {
|
||||
Ok(services) => services,
|
||||
Err(e) => {
|
||||
debug!("Failed to get monitored services: {}", e);
|
||||
return Ok(metrics);
|
||||
}
|
||||
};
|
||||
|
||||
// Collect individual metrics for each monitored service (status, memory, disk only)
|
||||
for service in &monitored_services {
|
||||
match self.get_service_status(service) {
|
||||
Ok((active_status, _detailed_info)) => {
|
||||
let status = self.calculate_service_status(&active_status);
|
||||
|
||||
// Individual service status metric
|
||||
metrics.push(Metric {
|
||||
name: format!("service_{}_status", service),
|
||||
value: MetricValue::String(active_status.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("Service {} status", service)),
|
||||
status,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
|
||||
// Service memory usage (if available)
|
||||
if let Some(memory_mb) = self.get_service_memory(service) {
|
||||
metrics.push(Metric {
|
||||
name: format!("service_{}_memory_mb", service),
|
||||
value: MetricValue::Float(memory_mb),
|
||||
unit: Some("MB".to_string()),
|
||||
description: Some(format!("Service {} memory usage", service)),
|
||||
status: Status::Ok,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
}
|
||||
|
||||
// Service disk usage (comprehensive detection)
|
||||
if let Some(disk_gb) = self.get_service_disk_usage(service) {
|
||||
metrics.push(Metric {
|
||||
name: format!("service_{}_disk_gb", service),
|
||||
value: MetricValue::Float(disk_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Service {} disk usage", service)),
|
||||
status: Status::Ok,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
}
|
||||
|
||||
// Sub-service metrics for specific services
|
||||
if service.contains("nginx") && active_status == "active" {
|
||||
metrics.extend(self.get_nginx_site_metrics());
|
||||
}
|
||||
|
||||
if service.contains("docker") && active_status == "active" {
|
||||
metrics.extend(self.get_docker_containers());
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to get status for service {}: {}", service, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let collection_time = start_time.elapsed();
|
||||
debug!(
|
||||
"Systemd collection completed in {:?} with {} individual service metrics",
|
||||
collection_time,
|
||||
metrics.len()
|
||||
);
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
|
||||
None // Performance tracking handled by cache system
|
||||
}
|
||||
}
|
||||
|
||||
impl SystemdCollector {
|
||||
/// Get nginx sites with latency checks
|
||||
fn get_nginx_sites(&self) -> Vec<Metric> {
|
||||
let mut metrics = Vec::new();
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
/// Get nginx sites with latency checks (internal - no caching)
|
||||
fn get_nginx_sites_internal(&self) -> Vec<(String, f32)> {
|
||||
let mut sites = Vec::new();
|
||||
|
||||
// Discover nginx sites from configuration
|
||||
let sites = self.discover_nginx_sites();
|
||||
let discovered_sites = self.discover_nginx_sites();
|
||||
|
||||
for (site_name, url) in &sites {
|
||||
// Always add all discovered sites, even if checks fail (like old version)
|
||||
for (site_name, url) in &discovered_sites {
|
||||
match self.check_site_latency(url) {
|
||||
Ok(latency_ms) => {
|
||||
let status = if latency_ms < 500.0 {
|
||||
Status::Ok
|
||||
} else if latency_ms < 2000.0 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Critical
|
||||
};
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("service_nginx_{}_latency_ms", site_name),
|
||||
value: MetricValue::Float(latency_ms),
|
||||
unit: Some("ms".to_string()),
|
||||
description: Some(format!("Response time for {}", url)),
|
||||
status,
|
||||
timestamp,
|
||||
});
|
||||
sites.push((site_name.clone(), latency_ms));
|
||||
}
|
||||
Err(_) => {
|
||||
// Site is unreachable
|
||||
metrics.push(Metric {
|
||||
name: format!("service_nginx_{}_latency_ms", site_name),
|
||||
value: MetricValue::Float(-1.0), // Use -1 to indicate error
|
||||
unit: Some("ms".to_string()),
|
||||
description: Some(format!("Response time for {} (unreachable)", url)),
|
||||
status: Status::Critical,
|
||||
timestamp,
|
||||
});
|
||||
// Site is unreachable - use -1.0 to indicate error (like old version)
|
||||
sites.push((site_name.clone(), -1.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
metrics
|
||||
sites
|
||||
}
|
||||
|
||||
/// Get docker containers as sub-services.
///
/// Shells out to `docker ps` and emits one status `Metric` per container.
/// NOTE(review): `ps` without `-a` only lists RUNNING containers, so the
/// "Exited" branch below should rarely trigger — confirm whether `-a` was
/// intended (the other collector variant uses `ps -a`).
fn get_docker_containers(&self) -> Vec<Metric> {
    let mut metrics = Vec::new();
    // Single timestamp shared by all metrics from this pass.
    let timestamp = chrono::Utc::now().timestamp() as u64;

    // Check if docker is available
    let output = Command::new("docker")
        .arg("ps")
        .arg("--format")
        .arg("{{.Names}},{{.Status}}")
        .output();

    let output = match output {
        Ok(out) if out.status.success() => out,
        _ => return metrics, // Docker not available or failed
    };

    // Invalid UTF-8 in the listing aborts with an empty result.
    let output_str = match String::from_utf8(output.stdout) {
        Ok(s) => s,
        Err(_) => return metrics,
    };

    for line in output_str.lines() {
        if line.trim().is_empty() {
            continue;
        }

        // Each row is "name,human-readable status" per the --format template.
        let parts: Vec<&str> = line.split(',').collect();
        if parts.len() >= 2 {
            let container_name = parts[0].trim();
            let status_str = parts[1].trim();

            // "Up …" → healthy; "Exited …" → warning; anything else critical.
            let status = if status_str.contains("Up") {
                Status::Ok
            } else if status_str.contains("Exited") {
                Status::Warning
            } else {
                Status::Critical
            };

            metrics.push(Metric {
                name: format!("service_docker_{}_status", container_name),
                value: MetricValue::String(status_str.to_string()),
                unit: None,
                description: Some(format!("Docker container {} status", container_name)),
                status,
                timestamp,
            });
        }
    }

    metrics
}
|
||||
|
||||
/// Check site latency using HTTP GET requests.
///
/// Measures wall-clock time for a full GET (including redirects, up to 10)
/// and returns it in milliseconds. 2xx/3xx responses count as success; any
/// other status or transport error is returned as `Err`.
/// NOTE(review): a new blocking client is built on every call — consider
/// reusing one if this runs per-site per-cycle.
fn check_site_latency(&self, url: &str) -> Result<f32, Box<dyn std::error::Error>> {
    use std::time::Duration;
    use std::time::Instant;

    let start = Instant::now();

    // Create HTTP client with timeouts (similar to legacy implementation)
    let client = reqwest::blocking::Client::builder()
        .timeout(Duration::from_secs(5))
        .connect_timeout(Duration::from_secs(2))
        .redirect(reqwest::redirect::Policy::limited(10))
        .build()?;

    // Make GET request and measure latency
    let response = client.get(url).send()?;
    let latency = start.elapsed().as_millis() as f32;

    // Check if response is successful (2xx or 3xx status codes)
    if response.status().is_success() || response.status().is_redirection() {
        Ok(latency)
    } else {
        Err(format!(
            "HTTP request failed for {} with status: {}",
            url,
            response.status()
        )
        .into())
    }
}
|
||||
|
||||
/// Discover nginx sites from configuration files (like the old working implementation)
|
||||
/// Discover nginx sites from configuration
|
||||
fn discover_nginx_sites(&self) -> Vec<(String, String)> {
|
||||
use tracing::debug;
|
||||
|
||||
// Use the same approach as the old working agent: get nginx config from systemd
|
||||
let config_content = match self.get_nginx_config_from_systemd() {
|
||||
Some(content) => content,
|
||||
@@ -640,12 +589,25 @@ impl SystemdCollector {
|
||||
self.parse_nginx_config_for_sites(&config_content)
|
||||
}
|
||||
|
||||
/// Fallback: get nginx config via nginx -T command
|
||||
fn get_nginx_config_via_command(&self) -> Option<String> {
|
||||
let output = Command::new("nginx")
|
||||
.args(&["-T"])
|
||||
.output()
|
||||
.ok()?;
|
||||
|
||||
if !output.status.success() {
|
||||
debug!("nginx -T failed");
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(String::from_utf8_lossy(&output.stdout).to_string())
|
||||
}
|
||||
|
||||
/// Get nginx config from systemd service definition (NixOS compatible)
|
||||
fn get_nginx_config_from_systemd(&self) -> Option<String> {
|
||||
use tracing::debug;
|
||||
|
||||
let output = std::process::Command::new("systemctl")
|
||||
.args(["show", "nginx", "--property=ExecStart", "--no-pager"])
|
||||
let output = Command::new("systemctl")
|
||||
.args(&["show", "nginx", "--property=ExecStart", "--no-pager"])
|
||||
.output()
|
||||
.ok()?;
|
||||
|
||||
@@ -661,13 +623,9 @@ impl SystemdCollector {
|
||||
for line in stdout.lines() {
|
||||
if line.starts_with("ExecStart=") {
|
||||
debug!("Found ExecStart line: {}", line);
|
||||
// Handle both traditional and NixOS systemd formats
|
||||
if let Some(config_path) = self.extract_config_path_from_exec_start(line) {
|
||||
debug!("Extracted config path: {}", config_path);
|
||||
// Read the config file
|
||||
return std::fs::read_to_string(&config_path)
|
||||
.map_err(|e| debug!("Failed to read config file {}: {}", config_path, e))
|
||||
.ok();
|
||||
return std::fs::read_to_string(&config_path).ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -677,8 +635,6 @@ impl SystemdCollector {
|
||||
|
||||
/// Extract config path from ExecStart line
|
||||
fn extract_config_path_from_exec_start(&self, exec_start: &str) -> Option<String> {
|
||||
use tracing::debug;
|
||||
|
||||
// Remove ExecStart= prefix
|
||||
let exec_part = exec_start.strip_prefix("ExecStart=")?;
|
||||
debug!("Parsing exec part: {}", exec_part);
|
||||
@@ -710,26 +666,8 @@ impl SystemdCollector {
|
||||
None
|
||||
}
|
||||
|
||||
/// Fallback: get nginx config via nginx -T command
|
||||
fn get_nginx_config_via_command(&self) -> Option<String> {
|
||||
use tracing::debug;
|
||||
|
||||
let output = std::process::Command::new("nginx")
|
||||
.args(["-T"])
|
||||
.output()
|
||||
.ok()?;
|
||||
|
||||
if !output.status.success() {
|
||||
debug!("nginx -T failed");
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(String::from_utf8_lossy(&output.stdout).to_string())
|
||||
}
|
||||
|
||||
/// Parse nginx config content to extract server names and build site list
|
||||
fn parse_nginx_config_for_sites(&self, config_content: &str) -> Vec<(String, String)> {
|
||||
use tracing::debug;
|
||||
let mut sites = Vec::new();
|
||||
let lines: Vec<&str> = config_content.lines().collect();
|
||||
let mut i = 0;
|
||||
@@ -739,11 +677,8 @@ impl SystemdCollector {
|
||||
while i < lines.len() {
|
||||
let line = lines[i].trim();
|
||||
if line.starts_with("server") && line.contains("{") {
|
||||
debug!("Found server block at line {}", i);
|
||||
if let Some(server_name) = self.parse_server_block(&lines, &mut i) {
|
||||
debug!("Extracted server name: {}", server_name);
|
||||
let url = format!("https://{}", server_name);
|
||||
// Use the full domain as the site name for clarity
|
||||
sites.push((server_name.clone(), url));
|
||||
}
|
||||
}
|
||||
@@ -756,7 +691,6 @@ impl SystemdCollector {
|
||||
|
||||
/// Parse a server block to extract the primary server_name
|
||||
fn parse_server_block(&self, lines: &[&str], start_index: &mut usize) -> Option<String> {
|
||||
use tracing::debug;
|
||||
let mut server_names = Vec::new();
|
||||
let mut has_redirect = false;
|
||||
let mut i = *start_index + 1;
|
||||
@@ -797,11 +731,151 @@ impl SystemdCollector {
|
||||
|
||||
*start_index = i - 1;
|
||||
|
||||
// Only return hostnames that are not redirects and have actual content
|
||||
if !server_names.is_empty() && !has_redirect {
|
||||
Some(server_names[0].clone())
|
||||
return Some(server_names[0].clone());
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Check site latency using HTTP GET requests
|
||||
fn check_site_latency(&self, url: &str) -> Result<f32, Box<dyn std::error::Error>> {
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
// Create HTTP client with timeouts from configuration
|
||||
let client = reqwest::blocking::Client::builder()
|
||||
.timeout(Duration::from_secs(self.config.http_timeout_seconds))
|
||||
.connect_timeout(Duration::from_secs(self.config.http_connect_timeout_seconds))
|
||||
.redirect(reqwest::redirect::Policy::limited(10))
|
||||
.build()?;
|
||||
|
||||
// Make GET request and measure latency
|
||||
let response = client.get(url).send()?;
|
||||
let latency = start.elapsed().as_millis() as f32;
|
||||
|
||||
// Check if response is successful (2xx or 3xx status codes)
|
||||
if response.status().is_success() || response.status().is_redirection() {
|
||||
Ok(latency)
|
||||
} else {
|
||||
None
|
||||
Err(format!(
|
||||
"HTTP request failed for {} with status: {}",
|
||||
url,
|
||||
response.status()
|
||||
)
|
||||
.into())
|
||||
}
|
||||
}
|
||||
|
||||
/// Get docker containers as sub-services
|
||||
fn get_docker_containers(&self) -> Vec<(String, String)> {
|
||||
let mut containers = Vec::new();
|
||||
|
||||
// Check if docker is available (cm-agent user is in docker group)
|
||||
// Use -a to show ALL containers (running and stopped)
|
||||
let output = Command::new("docker")
|
||||
.args(&["ps", "-a", "--format", "{{.Names}},{{.Status}}"])
|
||||
.output();
|
||||
|
||||
let output = match output {
|
||||
Ok(out) if out.status.success() => out,
|
||||
_ => return containers, // Docker not available or failed
|
||||
};
|
||||
|
||||
let output_str = match String::from_utf8(output.stdout) {
|
||||
Ok(s) => s,
|
||||
Err(_) => return containers,
|
||||
};
|
||||
|
||||
for line in output_str.lines() {
|
||||
if line.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let parts: Vec<&str> = line.split(',').collect();
|
||||
if parts.len() >= 2 {
|
||||
let container_name = parts[0].trim();
|
||||
let status_str = parts[1].trim();
|
||||
|
||||
let container_status = if status_str.contains("Up") {
|
||||
"active"
|
||||
} else if status_str.contains("Exited") || status_str.contains("Created") {
|
||||
"inactive" // Stopped/created containers are inactive
|
||||
} else {
|
||||
"failed" // Other states (restarting, paused, dead) → failed
|
||||
};
|
||||
|
||||
containers.push((format!("docker_{}", container_name), container_status.to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
containers
|
||||
}
|
||||
|
||||
/// Get docker images as sub-services
|
||||
fn get_docker_images(&self) -> Vec<(String, String, String)> {
|
||||
let mut images = Vec::new();
|
||||
// Check if docker is available (cm-agent user is in docker group)
|
||||
let output = Command::new("docker")
|
||||
.args(&["images", "--format", "{{.Repository}}:{{.Tag}},{{.Size}}"])
|
||||
.output();
|
||||
|
||||
let output = match output {
|
||||
Ok(out) if out.status.success() => out,
|
||||
Ok(_) => {
|
||||
return images;
|
||||
}
|
||||
Err(_) => {
|
||||
return images;
|
||||
}
|
||||
};
|
||||
|
||||
let output_str = match String::from_utf8(output.stdout) {
|
||||
Ok(s) => s,
|
||||
Err(_) => return images,
|
||||
};
|
||||
|
||||
for line in output_str.lines() {
|
||||
if line.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let parts: Vec<&str> = line.split(',').collect();
|
||||
if parts.len() >= 2 {
|
||||
let image_name = parts[0].trim();
|
||||
let size = parts[1].trim();
|
||||
|
||||
// Skip <none>:<none> images (dangling images)
|
||||
if image_name.contains("<none>") {
|
||||
continue;
|
||||
}
|
||||
|
||||
images.push((
|
||||
format!("image_{}", image_name),
|
||||
"active".to_string(), // Images are always "active" (present)
|
||||
size.to_string()
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
images
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for SystemdCollector {
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
// Use cached complete data if available and fresh
|
||||
if let Some(cached_complete_services) = self.get_cached_complete_services() {
|
||||
for service_data in cached_complete_services {
|
||||
agent_data.services.push(service_data);
|
||||
}
|
||||
Ok(())
|
||||
} else {
|
||||
// Collect fresh data
|
||||
self.collect_service_data(agent_data).await
|
||||
}
|
||||
}
|
||||
}
|
||||
403
agent/src/collectors/systemd_old.rs
Normal file
403
agent/src/collectors/systemd_old.rs
Normal file
@@ -0,0 +1,403 @@
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{AgentData, ServiceData, Status};
|
||||
use std::process::Command;
|
||||
use std::sync::RwLock;
|
||||
use std::time::Instant;
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError};
|
||||
use crate::config::SystemdConfig;
|
||||
|
||||
/// Systemd collector for monitoring systemd services with structured data output.
///
/// Thread-safe: all mutable state lives behind the `RwLock`, so `&self`
/// methods can be called concurrently.
pub struct SystemdCollector {
    /// Cached state with thread-safe interior mutability
    state: RwLock<ServiceCacheState>,
    /// Configuration for service monitoring (filters, intervals, paths)
    config: SystemdConfig,
}
|
||||
|
||||
/// Internal state for service caching.
///
/// Collection results and the discovered service list are cached separately:
/// collections refresh on a short TTL, discovery on `discovery_interval_seconds`.
#[derive(Debug, Clone)]
struct ServiceCacheState {
    /// Last collection time for performance tracking
    last_collection: Option<Instant>,
    /// Cached service data
    services: Vec<ServiceInfo>,
    /// Interesting services to monitor (cached after discovery)
    monitored_services: Vec<String>,
    /// Cached service status information from discovery
    service_status_cache: std::collections::HashMap<String, ServiceStatusInfo>,
    /// Last time services were discovered
    last_discovery_time: Option<Instant>,
    /// How often to rediscover services (from config)
    discovery_interval_seconds: u64,
}
|
||||
|
||||
/// Cached service status information from `systemctl list-units`.
#[derive(Debug, Clone)]
struct ServiceStatusInfo {
    // LOAD column, e.g. "loaded" / "not-found"
    load_state: String,
    // ACTIVE column, e.g. "active" / "inactive" / "failed"
    active_state: String,
    // SUB column, e.g. "running" / "dead" / "exited"
    sub_state: String,
}
|
||||
|
||||
/// Internal service information — one monitored service's snapshot.
#[derive(Debug, Clone)]
struct ServiceInfo {
    // Unit name without the ".service" suffix
    name: String,
    status: String, // "active", "inactive", "failed", etc.
    memory_mb: f32, // Memory usage in MB (0.0 when accounting is unavailable)
    disk_gb: f32,   // Disk usage in GB (usually 0 for services)
}
|
||||
|
||||
impl SystemdCollector {
|
||||
/// Build a collector with an empty cache; first collection/discovery is
/// forced because all `last_*` timestamps start as `None`.
pub fn new(config: SystemdConfig) -> Self {
    let state = ServiceCacheState {
        last_collection: None,
        services: Vec::new(),
        monitored_services: Vec::new(),
        service_status_cache: std::collections::HashMap::new(),
        last_discovery_time: None,
        // Discovery cadence comes straight from the collector config.
        discovery_interval_seconds: config.interval_seconds,
    };

    Self {
        state: RwLock::new(state),
        config,
    }
}
|
||||
|
||||
/// Collect service data and populate AgentData.
///
/// Flow: resolve the monitored-service list (cached discovery), query each
/// service's status/memory/disk, write the snapshot into the cache, then
/// append `ServiceData` entries to `agent_data.services`.
/// Errors from individual services are logged and skipped, never fatal.
async fn collect_service_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
    let start_time = Instant::now();
    debug!("Collecting systemd services metrics");

    // Get cached services (discovery only happens when needed).
    // A discovery failure degrades to "no services" rather than an error.
    let monitored_services = match self.get_monitored_services() {
        Ok(services) => services,
        Err(e) => {
            debug!("Failed to get monitored services: {}", e);
            return Ok(());
        }
    };

    // Collect service data for each monitored service
    let mut services = Vec::new();
    for service_name in &monitored_services {
        match self.get_service_status(service_name) {
            Ok((active_status, _detailed_info)) => {
                // Memory/disk lookups are best-effort; 0.0 on failure.
                let memory_mb = self.get_service_memory_usage(service_name).await.unwrap_or(0.0);
                let disk_gb = self.get_service_disk_usage(service_name).await.unwrap_or(0.0);

                let service_info = ServiceInfo {
                    name: service_name.clone(),
                    status: active_status,
                    memory_mb,
                    disk_gb,
                };
                services.push(service_info);
            }
            Err(e) => {
                // Skip services whose status can't be read; keep collecting.
                debug!("Failed to get status for service {}: {}", service_name, e);
            }
        }
    }

    // Update cached state (scoped so the write lock is released before
    // the AgentData population below).
    {
        let mut state = self.state.write().unwrap();
        state.last_collection = Some(start_time);
        state.services = services.clone();
    }

    // Populate AgentData with service information
    for service in services {
        agent_data.services.push(ServiceData {
            name: service.name.clone(),
            status: service.status.clone(),
            memory_mb: service.memory_mb,
            disk_gb: service.disk_gb,
            user_stopped: false, // TODO: Integrate with service tracker
            service_status: self.calculate_service_status(&service.name, &service.status),
        });
    }

    let elapsed = start_time.elapsed();
    debug!("Systemd collection completed in {:?} with {} services", elapsed, agent_data.services.len());

    Ok(())
}
|
||||
|
||||
/// Get systemd services information.
///
/// Two-pass strategy: `list-unit-files` enumerates EVERY installed service
/// (including ones that have never run), `list-units --all` supplies runtime
/// state for loaded units. Units present only in the first pass are defaulted
/// to inactive/dead. Filters from config are applied before the (expensive)
/// per-service memory/disk lookups.
async fn get_systemd_services(&self) -> Result<Vec<ServiceInfo>, CollectorError> {
    let mut services = Vec::new();

    // Get ALL service unit files (includes inactive services)
    let unit_files_output = Command::new("systemctl")
        .args(&["list-unit-files", "--type=service", "--no-pager", "--plain"])
        .output()
        .map_err(|e| CollectorError::SystemRead {
            path: "systemctl list-unit-files".to_string(),
            error: e.to_string(),
        })?;

    // Get runtime status of ALL units (including inactive)
    let status_output = Command::new("systemctl")
        .args(&["list-units", "--type=service", "--all", "--no-pager", "--plain"])
        .output()
        .map_err(|e| CollectorError::SystemRead {
            path: "systemctl list-units --all".to_string(),
            error: e.to_string(),
        })?;

    let unit_files_str = String::from_utf8_lossy(&unit_files_output.stdout);
    let status_str = String::from_utf8_lossy(&status_output.stdout);

    // Parse all service unit files to get complete service list.
    // Column 0 is "UNIT FILE"; the ".service" suffix is stripped so names
    // match the rest of the collector.
    let mut all_service_names = std::collections::HashSet::new();
    for line in unit_files_str.lines() {
        let fields: Vec<&str> = line.split_whitespace().collect();
        if fields.len() >= 2 && fields[0].ends_with(".service") {
            let service_name = fields[0].trim_end_matches(".service");
            all_service_names.insert(service_name.to_string());
        }
    }

    // Parse runtime status for all units (columns: UNIT LOAD ACTIVE SUB ...).
    let mut status_cache = std::collections::HashMap::new();
    for line in status_str.lines() {
        let fields: Vec<&str> = line.split_whitespace().collect();
        if fields.len() >= 4 && fields[0].ends_with(".service") {
            let service_name = fields[0].trim_end_matches(".service");
            let load_state = fields.get(1).unwrap_or(&"unknown").to_string();
            let active_state = fields.get(2).unwrap_or(&"unknown").to_string();
            let sub_state = fields.get(3).unwrap_or(&"unknown").to_string();
            status_cache.insert(service_name.to_string(), (load_state, active_state, sub_state));
        }
    }

    // For services found in unit files but not in runtime status, set default inactive status
    for service_name in &all_service_names {
        if !status_cache.contains_key(service_name) {
            status_cache.insert(service_name.to_string(), (
                "not-loaded".to_string(),
                "inactive".to_string(),
                "dead".to_string()
            ));
        }
    }

    // Process all discovered services and apply filters
    for service_name in &all_service_names {
        if self.should_monitor_service(service_name) {
            // `load_state` is bound by the tuple destructure but unused here.
            if let Some((load_state, active_state, sub_state)) = status_cache.get(service_name) {
                // Best-effort resource lookups; 0.0 when unavailable.
                let memory_mb = self.get_service_memory_usage(service_name).await.unwrap_or(0.0);
                let disk_gb = self.get_service_disk_usage(service_name).await.unwrap_or(0.0);

                let normalized_status = self.normalize_service_status(active_state, sub_state);
                let service_info = ServiceInfo {
                    name: service_name.to_string(),
                    status: normalized_status,
                    memory_mb,
                    disk_gb,
                };

                services.push(service_info);
            }
        }
    }

    Ok(services)
}
|
||||
|
||||
/// Check if a service should be monitored based on configuration filters with wildcard support
|
||||
fn should_monitor_service(&self, service_name: &str) -> bool {
|
||||
// If no filters configured, monitor nothing (to prevent noise)
|
||||
if self.config.service_name_filters.is_empty() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if service matches any of the configured patterns
|
||||
for pattern in &self.config.service_name_filters {
|
||||
if self.matches_pattern(service_name, pattern) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Check if service name matches pattern (supports wildcards like nginx*)
|
||||
fn matches_pattern(&self, service_name: &str, pattern: &str) -> bool {
|
||||
if pattern.ends_with('*') {
|
||||
let prefix = &pattern[..pattern.len() - 1];
|
||||
service_name.starts_with(prefix)
|
||||
} else {
|
||||
service_name == pattern
|
||||
}
|
||||
}
|
||||
|
||||
/// Get disk usage for a specific service, in GB.
///
/// Priority: (1) directories configured for the service, first one that
/// `du` can size wins; (2) the unit's WorkingDirectory from systemd.
/// Returns Ok(0.0) when nothing measurable is found; Err only when the
/// systemctl invocation itself fails.
async fn get_service_disk_usage(&self, service_name: &str) -> Result<f32, CollectorError> {
    // Check if this service has configured directory paths
    if let Some(dirs) = self.config.service_directories.get(service_name) {
        // Service has configured paths - use the first accessible one
        for dir in dirs {
            if let Some(size) = self.get_directory_size(dir) {
                return Ok(size);
            }
        }
        // If configured paths failed, return 0
        return Ok(0.0);
    }

    // No configured path - try to get WorkingDirectory from systemctl
    let output = Command::new("systemctl")
        .args(&["show", &format!("{}.service", service_name), "--property=WorkingDirectory"])
        .output()
        .map_err(|e| CollectorError::SystemRead {
            path: format!("WorkingDirectory for {}", service_name),
            error: e.to_string(),
        })?;

    let output_str = String::from_utf8_lossy(&output.stdout);
    for line in output_str.lines() {
        // "[not set]" is systemd's marker for an unset property.
        if line.starts_with("WorkingDirectory=") && !line.contains("[not set]") {
            let dir = line.strip_prefix("WorkingDirectory=").unwrap_or("");
            if !dir.is_empty() {
                return Ok(self.get_directory_size(dir).unwrap_or(0.0));
            }
        }
    }

    Ok(0.0)
}
|
||||
|
||||
/// Get size of a directory in GB
|
||||
fn get_directory_size(&self, path: &str) -> Option<f32> {
|
||||
let output = Command::new("du")
|
||||
.args(&["-sb", path])
|
||||
.output()
|
||||
.ok()?;
|
||||
|
||||
if !output.status.success() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
let parts: Vec<&str> = output_str.split_whitespace().collect();
|
||||
if let Some(size_str) = parts.first() {
|
||||
if let Ok(size_bytes) = size_str.parse::<u64>() {
|
||||
return Some(size_bytes as f32 / (1024.0 * 1024.0 * 1024.0));
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Calculate service status, taking user-stopped services into account.
///
/// Maps systemd's ActiveState (case-insensitive) to the dashboard Status.
/// NOTE(review): despite the doc comment, user-stopped services are not
/// consulted in this body — confirm whether that handling lives in a caller.
fn calculate_service_status(&self, service_name: &str, active_status: &str) -> Status {
    match active_status.to_lowercase().as_str() {
        "active" => Status::Ok,
        "inactive" | "dead" => {
            debug!("Service '{}' is inactive - treating as Inactive status", service_name);
            Status::Inactive
        },
        "failed" | "error" => Status::Critical,
        // Transitional states report as Pending until systemd settles them.
        "activating" | "deactivating" | "reloading" | "starting" | "stopping" => {
            debug!("Service '{}' is transitioning - treating as Pending", service_name);
            Status::Pending
        },
        _ => Status::Unknown,
    }
}
|
||||
|
||||
/// Get memory usage for a specific service
|
||||
async fn get_service_memory_usage(&self, service_name: &str) -> Result<f32, CollectorError> {
|
||||
let output = Command::new("systemctl")
|
||||
.args(&["show", &format!("{}.service", service_name), "--property=MemoryCurrent"])
|
||||
.output()
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: format!("memory usage for {}", service_name),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
|
||||
for line in output_str.lines() {
|
||||
if line.starts_with("MemoryCurrent=") {
|
||||
if let Some(mem_str) = line.strip_prefix("MemoryCurrent=") {
|
||||
if mem_str != "[not set]" {
|
||||
if let Ok(memory_bytes) = mem_str.parse::<u64>() {
|
||||
return Ok(memory_bytes as f32 / (1024.0 * 1024.0)); // Convert to MB
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(0.0)
|
||||
}
|
||||
|
||||
/// Normalize service status to standard values
|
||||
fn normalize_service_status(&self, active_state: &str, sub_state: &str) -> String {
|
||||
match (active_state, sub_state) {
|
||||
("active", "running") => "active".to_string(),
|
||||
("active", _) => "active".to_string(),
|
||||
("inactive", "dead") => "inactive".to_string(),
|
||||
("inactive", _) => "inactive".to_string(),
|
||||
("failed", _) => "failed".to_string(),
|
||||
("activating", _) => "starting".to_string(),
|
||||
("deactivating", _) => "stopping".to_string(),
|
||||
_ => format!("{}:{}", active_state, sub_state),
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if service collection cache should be updated
|
||||
fn should_update_cache(&self) -> bool {
|
||||
let state = self.state.read().unwrap();
|
||||
|
||||
match state.last_collection {
|
||||
None => true,
|
||||
Some(last) => {
|
||||
let cache_duration = std::time::Duration::from_secs(30);
|
||||
last.elapsed() > cache_duration
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get cached service data if available and fresh
|
||||
fn get_cached_services(&self) -> Option<Vec<ServiceInfo>> {
|
||||
if !self.should_update_cache() {
|
||||
let state = self.state.read().unwrap();
|
||||
Some(state.services.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for SystemdCollector {
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
// Use cached data if available and fresh
|
||||
if let Some(cached_services) = self.get_cached_services() {
|
||||
debug!("Using cached systemd services data");
|
||||
for service in cached_services {
|
||||
agent_data.services.push(ServiceData {
|
||||
name: service.name.clone(),
|
||||
status: service.status.clone(),
|
||||
memory_mb: service.memory_mb,
|
||||
disk_gb: service.disk_gb,
|
||||
user_stopped: false, // TODO: Integrate with service tracker
|
||||
service_status: self.calculate_service_status(&service.name, &service.status),
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
} else {
|
||||
// Collect fresh data
|
||||
self.collect_service_data(agent_data).await
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::Result;
|
||||
use cm_dashboard_shared::{MetricMessage, MessageEnvelope};
|
||||
use tracing::{info, debug};
|
||||
use cm_dashboard_shared::{AgentData, MessageEnvelope};
|
||||
use tracing::{debug, info};
|
||||
use zmq::{Context, Socket, SocketType};
|
||||
|
||||
use crate::config::ZmqConfig;
|
||||
@@ -9,81 +9,71 @@ use crate::config::ZmqConfig;
|
||||
pub struct ZmqHandler {
|
||||
publisher: Socket,
|
||||
command_receiver: Socket,
|
||||
config: ZmqConfig,
|
||||
}
|
||||
|
||||
impl ZmqHandler {
|
||||
pub async fn new(config: &ZmqConfig) -> Result<Self> {
|
||||
let context = Context::new();
|
||||
|
||||
|
||||
// Create publisher socket for metrics
|
||||
let publisher = context.socket(SocketType::PUB)?;
|
||||
let pub_bind_address = format!("tcp://{}:{}", config.bind_address, config.publisher_port);
|
||||
publisher.bind(&pub_bind_address)?;
|
||||
|
||||
|
||||
info!("ZMQ publisher bound to {}", pub_bind_address);
|
||||
|
||||
|
||||
// Set socket options for efficiency
|
||||
publisher.set_sndhwm(1000)?; // High water mark for outbound messages
|
||||
publisher.set_linger(1000)?; // Linger time on close
|
||||
|
||||
|
||||
// Create command receiver socket (PULL socket to receive commands from dashboard)
|
||||
let command_receiver = context.socket(SocketType::PULL)?;
|
||||
let cmd_bind_address = format!("tcp://{}:{}", config.bind_address, config.command_port);
|
||||
command_receiver.bind(&cmd_bind_address)?;
|
||||
|
||||
|
||||
info!("ZMQ command receiver bound to {}", cmd_bind_address);
|
||||
|
||||
|
||||
// Set non-blocking mode for command receiver
|
||||
command_receiver.set_rcvtimeo(0)?; // Non-blocking receive
|
||||
command_receiver.set_linger(1000)?;
|
||||
|
||||
|
||||
Ok(Self {
|
||||
publisher,
|
||||
command_receiver,
|
||||
config: config.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Publish metrics message via ZMQ
|
||||
pub async fn publish_metrics(&self, message: &MetricMessage) -> Result<()> {
|
||||
debug!("Publishing {} metrics for host {}", message.metrics.len(), message.hostname);
|
||||
|
||||
// Create message envelope
|
||||
let envelope = MessageEnvelope::metrics(message.clone())
|
||||
.map_err(|e| anyhow::anyhow!("Failed to create message envelope: {}", e))?;
|
||||
|
||||
|
||||
|
||||
/// Publish agent data via ZMQ
|
||||
pub async fn publish_agent_data(&self, data: &AgentData) -> Result<()> {
|
||||
debug!(
|
||||
"Publishing agent data for host {}",
|
||||
data.hostname
|
||||
);
|
||||
|
||||
// Create message envelope for agent data
|
||||
let envelope = MessageEnvelope::agent_data(data.clone())
|
||||
.map_err(|e| anyhow::anyhow!("Failed to create agent data envelope: {}", e))?;
|
||||
|
||||
// Serialize envelope
|
||||
let serialized = serde_json::to_vec(&envelope)?;
|
||||
|
||||
|
||||
// Send via ZMQ
|
||||
self.publisher.send(&serialized, 0)?;
|
||||
|
||||
debug!("Published metrics message ({} bytes)", serialized.len());
|
||||
|
||||
debug!("Published agent data message ({} bytes)", serialized.len());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Send heartbeat (placeholder for future use)
|
||||
pub async fn send_heartbeat(&self) -> Result<()> {
|
||||
let envelope = MessageEnvelope::heartbeat()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to create heartbeat envelope: {}", e))?;
|
||||
|
||||
let serialized = serde_json::to_vec(&envelope)?;
|
||||
self.publisher.send(&serialized, 0)?;
|
||||
|
||||
debug!("Sent heartbeat");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Try to receive a command (non-blocking)
|
||||
pub fn try_receive_command(&self) -> Result<Option<AgentCommand>> {
|
||||
match self.command_receiver.recv_bytes(zmq::DONTWAIT) {
|
||||
Ok(bytes) => {
|
||||
debug!("Received command message ({} bytes)", bytes.len());
|
||||
|
||||
|
||||
let command: AgentCommand = serde_json::from_slice(&bytes)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to deserialize command: {}", e))?;
|
||||
|
||||
|
||||
debug!("Parsed command: {:?}", command);
|
||||
Ok(Some(command))
|
||||
}
|
||||
@@ -107,4 +97,4 @@ pub enum AgentCommand {
|
||||
ToggleCollector { name: String, enabled: bool },
|
||||
/// Request status/health check
|
||||
Ping,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,55 +0,0 @@
|
||||
// Collection intervals
|
||||
pub const DEFAULT_COLLECTION_INTERVAL_SECONDS: u64 = 2;
|
||||
pub const DEFAULT_CPU_INTERVAL_SECONDS: u64 = 5;
|
||||
pub const DEFAULT_MEMORY_INTERVAL_SECONDS: u64 = 5;
|
||||
pub const DEFAULT_DISK_INTERVAL_SECONDS: u64 = 300; // 5 minutes
|
||||
pub const DEFAULT_PROCESS_INTERVAL_SECONDS: u64 = 30;
|
||||
pub const DEFAULT_SYSTEMD_INTERVAL_SECONDS: u64 = 30;
|
||||
pub const DEFAULT_SMART_INTERVAL_SECONDS: u64 = 900; // 15 minutes
|
||||
pub const DEFAULT_BACKUP_INTERVAL_SECONDS: u64 = 900; // 15 minutes
|
||||
pub const DEFAULT_NETWORK_INTERVAL_SECONDS: u64 = 30;
|
||||
|
||||
// ZMQ configuration
|
||||
pub const DEFAULT_ZMQ_PUBLISHER_PORT: u16 = 6130;
|
||||
pub const DEFAULT_ZMQ_COMMAND_PORT: u16 = 6131;
|
||||
pub const DEFAULT_ZMQ_BIND_ADDRESS: &str = "0.0.0.0";
|
||||
pub const DEFAULT_ZMQ_TIMEOUT_MS: u64 = 5000;
|
||||
pub const DEFAULT_ZMQ_HEARTBEAT_INTERVAL_MS: u64 = 30000;
|
||||
|
||||
// CPU thresholds (production values from legacy)
|
||||
pub const DEFAULT_CPU_LOAD_WARNING: f32 = 9.0;
|
||||
pub const DEFAULT_CPU_LOAD_CRITICAL: f32 = 10.0;
|
||||
pub const DEFAULT_CPU_TEMP_WARNING: f32 = 100.0; // Effectively disabled
|
||||
pub const DEFAULT_CPU_TEMP_CRITICAL: f32 = 100.0; // Effectively disabled
|
||||
|
||||
// Memory thresholds (from legacy)
|
||||
pub const DEFAULT_MEMORY_WARNING_PERCENT: f32 = 80.0;
|
||||
pub const DEFAULT_MEMORY_CRITICAL_PERCENT: f32 = 95.0;
|
||||
|
||||
// Disk thresholds
|
||||
pub const DEFAULT_DISK_WARNING_PERCENT: f32 = 80.0;
|
||||
pub const DEFAULT_DISK_CRITICAL_PERCENT: f32 = 90.0;
|
||||
|
||||
// Process configuration
|
||||
pub const DEFAULT_TOP_PROCESSES_COUNT: usize = 10;
|
||||
|
||||
// Service thresholds
|
||||
pub const DEFAULT_SERVICE_MEMORY_WARNING_MB: f32 = 1000.0;
|
||||
pub const DEFAULT_SERVICE_MEMORY_CRITICAL_MB: f32 = 2000.0;
|
||||
|
||||
// SMART thresholds
|
||||
pub const DEFAULT_SMART_TEMP_WARNING: f32 = 60.0;
|
||||
pub const DEFAULT_SMART_TEMP_CRITICAL: f32 = 70.0;
|
||||
pub const DEFAULT_SMART_WEAR_WARNING: f32 = 80.0;
|
||||
pub const DEFAULT_SMART_WEAR_CRITICAL: f32 = 90.0;
|
||||
|
||||
// Backup configuration
|
||||
pub const DEFAULT_BACKUP_MAX_AGE_HOURS: u64 = 48;
|
||||
|
||||
|
||||
// Notification configuration (from legacy)
|
||||
pub const DEFAULT_SMTP_HOST: &str = "localhost";
|
||||
pub const DEFAULT_SMTP_PORT: u16 = 25;
|
||||
pub const DEFAULT_FROM_EMAIL: &str = "{hostname}@cmtec.se";
|
||||
pub const DEFAULT_TO_EMAIL: &str = "cm@cmtec.se";
|
||||
pub const DEFAULT_NOTIFICATION_RATE_LIMIT_MINUTES: u64 = 30;
|
||||
@@ -1,18 +1,19 @@
|
||||
use anyhow::{Context, Result};
|
||||
use std::path::Path;
|
||||
use std::fs;
|
||||
use crate::config::AgentConfig;
|
||||
use anyhow::{Context, Result};
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
pub fn load_config<P: AsRef<Path>>(path: P) -> Result<AgentConfig> {
|
||||
let path = path.as_ref();
|
||||
let content = fs::read_to_string(path)
|
||||
.with_context(|| format!("Failed to read config file: {}", path.display()))?;
|
||||
|
||||
|
||||
let config: AgentConfig = toml::from_str(&content)
|
||||
.with_context(|| format!("Failed to parse config file: {}", path.display()))?;
|
||||
|
||||
config.validate()
|
||||
|
||||
config
|
||||
.validate()
|
||||
.with_context(|| format!("Invalid configuration in file: {}", path.display()))?;
|
||||
|
||||
|
||||
Ok(config)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,15 +1,11 @@
|
||||
use anyhow::Result;
|
||||
use cm_dashboard_shared::CacheConfig;
|
||||
use gethostname::gethostname;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::Path;
|
||||
|
||||
pub mod defaults;
|
||||
pub mod loader;
|
||||
pub mod validation;
|
||||
|
||||
use defaults::*;
|
||||
|
||||
/// Main agent configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct AgentConfig {
|
||||
@@ -26,8 +22,10 @@ pub struct ZmqConfig {
|
||||
pub publisher_port: u16,
|
||||
pub command_port: u16,
|
||||
pub bind_address: String,
|
||||
pub timeout_ms: u64,
|
||||
pub heartbeat_interval_ms: u64,
|
||||
pub transmission_interval_seconds: u64,
|
||||
/// Heartbeat transmission interval in seconds for host connectivity detection
|
||||
#[serde(default = "default_heartbeat_interval_seconds")]
|
||||
pub heartbeat_interval_seconds: u64,
|
||||
}
|
||||
|
||||
/// Collector configuration
|
||||
@@ -36,11 +34,10 @@ pub struct CollectorConfig {
|
||||
pub cpu: CpuConfig,
|
||||
pub memory: MemoryConfig,
|
||||
pub disk: DiskConfig,
|
||||
pub processes: ProcessConfig,
|
||||
pub systemd: SystemdConfig,
|
||||
pub smart: SmartConfig,
|
||||
pub backup: BackupConfig,
|
||||
pub network: NetworkConfig,
|
||||
pub nixos: NixOSConfig,
|
||||
}
|
||||
|
||||
/// CPU collector configuration
|
||||
@@ -59,7 +56,9 @@ pub struct CpuConfig {
|
||||
pub struct MemoryConfig {
|
||||
pub enabled: bool,
|
||||
pub interval_seconds: u64,
|
||||
/// Memory usage warning threshold (percentage)
|
||||
pub usage_warning_percent: f32,
|
||||
/// Memory usage critical threshold (percentage)
|
||||
pub usage_critical_percent: f32,
|
||||
}
|
||||
|
||||
@@ -68,49 +67,55 @@ pub struct MemoryConfig {
|
||||
pub struct DiskConfig {
|
||||
pub enabled: bool,
|
||||
pub interval_seconds: u64,
|
||||
/// Disk usage warning threshold (percentage)
|
||||
pub usage_warning_percent: f32,
|
||||
/// Disk usage critical threshold (percentage)
|
||||
pub usage_critical_percent: f32,
|
||||
/// Filesystem configurations (optional - auto-discovery used if empty)
|
||||
#[serde(default)]
|
||||
pub filesystems: Vec<FilesystemConfig>,
|
||||
/// SMART monitoring thresholds
|
||||
pub temperature_warning_celsius: f32,
|
||||
pub temperature_critical_celsius: f32,
|
||||
pub wear_warning_percent: f32,
|
||||
pub wear_critical_percent: f32,
|
||||
}
|
||||
|
||||
/// Filesystem configuration entry
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct FilesystemConfig {
|
||||
pub name: String, // Human-readable name (e.g., "root", "boot", "home")
|
||||
pub uuid: String, // UUID for /dev/disk/by-uuid/ resolution
|
||||
pub mount_point: String, // Expected mount point (e.g., "/", "/boot")
|
||||
pub fs_type: String, // Filesystem type (e.g., "ext4", "vfat")
|
||||
pub monitor: bool, // Whether to monitor this filesystem
|
||||
pub name: String,
|
||||
pub uuid: String,
|
||||
pub mount_point: String,
|
||||
pub fs_type: String, // "ext4", "zfs", "xfs", "mergerfs", "btrfs"
|
||||
pub monitor: bool,
|
||||
pub storage_type: String, // "single", "raid", "mirror", "mergerfs", "zfs"
|
||||
}
|
||||
|
||||
/// Process collector configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ProcessConfig {
|
||||
pub enabled: bool,
|
||||
pub interval_seconds: u64,
|
||||
pub top_processes_count: usize,
|
||||
}
|
||||
|
||||
/// Systemd services collector configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SystemdConfig {
|
||||
pub enabled: bool,
|
||||
pub interval_seconds: u64,
|
||||
pub auto_discover: bool,
|
||||
pub services: Vec<String>,
|
||||
pub service_name_filters: Vec<String>,
|
||||
pub excluded_services: Vec<String>,
|
||||
pub memory_warning_mb: f32,
|
||||
pub memory_critical_mb: f32,
|
||||
pub service_directories: std::collections::HashMap<String, Vec<String>>,
|
||||
pub host_user_mapping: String,
|
||||
pub nginx_check_interval_seconds: u64,
|
||||
pub http_timeout_seconds: u64,
|
||||
pub http_connect_timeout_seconds: u64,
|
||||
pub nginx_latency_critical_ms: f32,
|
||||
}
|
||||
|
||||
/// SMART collector configuration
|
||||
|
||||
/// NixOS collector configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SmartConfig {
|
||||
pub struct NixOSConfig {
|
||||
pub enabled: bool,
|
||||
pub interval_seconds: u64,
|
||||
pub temperature_warning_celsius: f32,
|
||||
pub temperature_critical_celsius: f32,
|
||||
pub wear_warning_percent: f32,
|
||||
pub wear_critical_percent: f32,
|
||||
}
|
||||
|
||||
/// Backup collector configuration
|
||||
@@ -127,8 +132,6 @@ pub struct BackupConfig {
|
||||
pub struct NetworkConfig {
|
||||
pub enabled: bool,
|
||||
pub interval_seconds: u64,
|
||||
pub interfaces: Vec<String>,
|
||||
pub auto_discover: bool,
|
||||
}
|
||||
|
||||
/// Notification configuration
|
||||
@@ -140,278 +143,31 @@ pub struct NotificationConfig {
|
||||
pub from_email: String,
|
||||
pub to_email: String,
|
||||
pub rate_limit_minutes: u64,
|
||||
/// Email notification batching interval in seconds (default: 60)
|
||||
pub aggregation_interval_seconds: u64,
|
||||
/// List of metric names to exclude from email notifications
|
||||
#[serde(default)]
|
||||
pub exclude_email_metrics: Vec<String>,
|
||||
/// Path to maintenance mode file that suppresses email notifications when present
|
||||
#[serde(default = "default_maintenance_mode_file")]
|
||||
pub maintenance_mode_file: String,
|
||||
}
|
||||
|
||||
|
||||
fn default_heartbeat_interval_seconds() -> u64 {
|
||||
5
|
||||
}
|
||||
|
||||
fn default_maintenance_mode_file() -> String {
|
||||
"/tmp/cm-maintenance".to_string()
|
||||
}
|
||||
|
||||
impl AgentConfig {
|
||||
pub fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||
loader::load_config(path)
|
||||
}
|
||||
|
||||
pub fn validate(&self) -> Result<()> {
|
||||
validation::validate_config(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for AgentConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
zmq: ZmqConfig::default(),
|
||||
collectors: CollectorConfig::default(),
|
||||
cache: CacheConfig::default(),
|
||||
notifications: NotificationConfig::default(),
|
||||
collection_interval_seconds: DEFAULT_COLLECTION_INTERVAL_SECONDS,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ZmqConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
publisher_port: DEFAULT_ZMQ_PUBLISHER_PORT,
|
||||
command_port: DEFAULT_ZMQ_COMMAND_PORT,
|
||||
bind_address: DEFAULT_ZMQ_BIND_ADDRESS.to_string(),
|
||||
timeout_ms: DEFAULT_ZMQ_TIMEOUT_MS,
|
||||
heartbeat_interval_ms: DEFAULT_ZMQ_HEARTBEAT_INTERVAL_MS,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for CollectorConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
cpu: CpuConfig::default(),
|
||||
memory: MemoryConfig::default(),
|
||||
disk: DiskConfig::default(),
|
||||
processes: ProcessConfig::default(),
|
||||
systemd: SystemdConfig::default(),
|
||||
smart: SmartConfig::default(),
|
||||
backup: BackupConfig::default(),
|
||||
network: NetworkConfig::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for CpuConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: true,
|
||||
interval_seconds: DEFAULT_CPU_INTERVAL_SECONDS,
|
||||
load_warning_threshold: DEFAULT_CPU_LOAD_WARNING,
|
||||
load_critical_threshold: DEFAULT_CPU_LOAD_CRITICAL,
|
||||
temperature_warning_threshold: DEFAULT_CPU_TEMP_WARNING,
|
||||
temperature_critical_threshold: DEFAULT_CPU_TEMP_CRITICAL,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for MemoryConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: true,
|
||||
interval_seconds: DEFAULT_MEMORY_INTERVAL_SECONDS,
|
||||
usage_warning_percent: DEFAULT_MEMORY_WARNING_PERCENT,
|
||||
usage_critical_percent: DEFAULT_MEMORY_CRITICAL_PERCENT,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for DiskConfig {
|
||||
fn default() -> Self {
|
||||
let hostname = gethostname::gethostname().to_string_lossy().to_string();
|
||||
let filesystems = get_default_filesystems_for_host(&hostname);
|
||||
|
||||
Self {
|
||||
enabled: true,
|
||||
interval_seconds: DEFAULT_DISK_INTERVAL_SECONDS,
|
||||
usage_warning_percent: DEFAULT_DISK_WARNING_PERCENT,
|
||||
usage_critical_percent: DEFAULT_DISK_CRITICAL_PERCENT,
|
||||
filesystems,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get default filesystem configurations for known CMTEC hosts
|
||||
fn get_default_filesystems_for_host(hostname: &str) -> Vec<FilesystemConfig> {
|
||||
match hostname {
|
||||
"cmbox" => vec![
|
||||
FilesystemConfig {
|
||||
name: "root".to_string(),
|
||||
uuid: "4cade5ce-85a5-4a03-83c8-dfd1d3888d79".to_string(),
|
||||
mount_point: "/".to_string(),
|
||||
fs_type: "ext4".to_string(),
|
||||
monitor: true,
|
||||
},
|
||||
FilesystemConfig {
|
||||
name: "boot".to_string(),
|
||||
uuid: "AB4D-62EC".to_string(),
|
||||
mount_point: "/boot".to_string(),
|
||||
fs_type: "vfat".to_string(),
|
||||
monitor: true,
|
||||
},
|
||||
],
|
||||
"srv02" => vec![
|
||||
FilesystemConfig {
|
||||
name: "root".to_string(),
|
||||
uuid: "5a880608-c79f-458f-a031-30206aa27ca7".to_string(),
|
||||
mount_point: "/".to_string(),
|
||||
fs_type: "ext4".to_string(),
|
||||
monitor: true,
|
||||
},
|
||||
FilesystemConfig {
|
||||
name: "boot".to_string(),
|
||||
uuid: "6B2E-2AD9".to_string(),
|
||||
mount_point: "/boot".to_string(),
|
||||
fs_type: "vfat".to_string(),
|
||||
monitor: true,
|
||||
},
|
||||
],
|
||||
"simonbox" => vec![
|
||||
FilesystemConfig {
|
||||
name: "root".to_string(),
|
||||
uuid: "b74284a9-2899-4f71-bdb0-fd07dc4baab3".to_string(),
|
||||
mount_point: "/".to_string(),
|
||||
fs_type: "ext4".to_string(),
|
||||
monitor: true,
|
||||
},
|
||||
FilesystemConfig {
|
||||
name: "boot".to_string(),
|
||||
uuid: "F6A3-AD2B".to_string(),
|
||||
mount_point: "/boot".to_string(),
|
||||
fs_type: "vfat".to_string(),
|
||||
monitor: true,
|
||||
},
|
||||
FilesystemConfig {
|
||||
name: "steampool_1".to_string(),
|
||||
uuid: "09300cb7-0938-4dba-8a42-7a7aaf60db51".to_string(),
|
||||
mount_point: "/steampool_1".to_string(),
|
||||
fs_type: "ext4".to_string(),
|
||||
monitor: true,
|
||||
},
|
||||
FilesystemConfig {
|
||||
name: "steampool_2".to_string(),
|
||||
uuid: "a2d61a41-3f2a-4760-b62e-5eb8caf50d1a".to_string(),
|
||||
mount_point: "/steampool_2".to_string(),
|
||||
fs_type: "ext4".to_string(),
|
||||
monitor: true,
|
||||
},
|
||||
],
|
||||
"steambox" => vec![
|
||||
FilesystemConfig {
|
||||
name: "root".to_string(),
|
||||
uuid: "4514ca9f-2d0a-40df-b14b-e342f39c3e6a".to_string(),
|
||||
mount_point: "/".to_string(),
|
||||
fs_type: "ext4".to_string(),
|
||||
monitor: true,
|
||||
},
|
||||
FilesystemConfig {
|
||||
name: "boot".to_string(),
|
||||
uuid: "8FD2-1B13".to_string(),
|
||||
mount_point: "/boot".to_string(),
|
||||
fs_type: "vfat".to_string(),
|
||||
monitor: true,
|
||||
},
|
||||
FilesystemConfig {
|
||||
name: "steampool".to_string(),
|
||||
uuid: "0ebe8abb-bbe7-4224-947b-86bf38981f60".to_string(),
|
||||
mount_point: "/mnt/steampool".to_string(),
|
||||
fs_type: "ext4".to_string(),
|
||||
monitor: true,
|
||||
},
|
||||
],
|
||||
"srv01" => vec![
|
||||
FilesystemConfig {
|
||||
name: "root".to_string(),
|
||||
uuid: "cd98df34-03a3-4d68-8338-d90d2920f9f8".to_string(),
|
||||
mount_point: "/".to_string(),
|
||||
fs_type: "ext4".to_string(),
|
||||
monitor: true,
|
||||
},
|
||||
FilesystemConfig {
|
||||
name: "boot".to_string(),
|
||||
uuid: "13E1-4DDE".to_string(),
|
||||
mount_point: "/boot".to_string(),
|
||||
fs_type: "vfat".to_string(),
|
||||
monitor: true,
|
||||
},
|
||||
],
|
||||
// labbox and wslbox have no UUIDs configured yet
|
||||
"labbox" | "wslbox" => {
|
||||
Vec::new()
|
||||
},
|
||||
_ => {
|
||||
// Unknown hosts use auto-discovery
|
||||
Vec::new()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ProcessConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: true,
|
||||
interval_seconds: DEFAULT_PROCESS_INTERVAL_SECONDS,
|
||||
top_processes_count: DEFAULT_TOP_PROCESSES_COUNT,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SystemdConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: true,
|
||||
interval_seconds: DEFAULT_SYSTEMD_INTERVAL_SECONDS,
|
||||
auto_discover: true,
|
||||
services: Vec::new(),
|
||||
memory_warning_mb: DEFAULT_SERVICE_MEMORY_WARNING_MB,
|
||||
memory_critical_mb: DEFAULT_SERVICE_MEMORY_CRITICAL_MB,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SmartConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: true,
|
||||
interval_seconds: DEFAULT_SMART_INTERVAL_SECONDS,
|
||||
temperature_warning_celsius: DEFAULT_SMART_TEMP_WARNING,
|
||||
temperature_critical_celsius: DEFAULT_SMART_TEMP_CRITICAL,
|
||||
wear_warning_percent: DEFAULT_SMART_WEAR_WARNING,
|
||||
wear_critical_percent: DEFAULT_SMART_WEAR_CRITICAL,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BackupConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: true,
|
||||
interval_seconds: DEFAULT_BACKUP_INTERVAL_SECONDS,
|
||||
backup_paths: Vec::new(),
|
||||
max_age_hours: DEFAULT_BACKUP_MAX_AGE_HOURS,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for NetworkConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: true,
|
||||
interval_seconds: DEFAULT_NETWORK_INTERVAL_SECONDS,
|
||||
interfaces: Vec::new(),
|
||||
auto_discover: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for NotificationConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: true,
|
||||
smtp_host: DEFAULT_SMTP_HOST.to_string(),
|
||||
smtp_port: DEFAULT_SMTP_PORT,
|
||||
from_email: DEFAULT_FROM_EMAIL.to_string(),
|
||||
to_email: DEFAULT_TO_EMAIL.to_string(),
|
||||
rate_limit_minutes: DEFAULT_NOTIFICATION_RATE_LIMIT_MINUTES,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,114 +1,123 @@
|
||||
use anyhow::{bail, Result};
|
||||
use crate::config::AgentConfig;
|
||||
use anyhow::{bail, Result};
|
||||
|
||||
pub fn validate_config(config: &AgentConfig) -> Result<()> {
|
||||
// Validate ZMQ configuration
|
||||
if config.zmq.publisher_port == 0 {
|
||||
bail!("ZMQ publisher port cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
if config.zmq.command_port == 0 {
|
||||
bail!("ZMQ command port cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
if config.zmq.publisher_port == config.zmq.command_port {
|
||||
bail!("ZMQ publisher and command ports cannot be the same");
|
||||
}
|
||||
|
||||
|
||||
if config.zmq.bind_address.is_empty() {
|
||||
bail!("ZMQ bind address cannot be empty");
|
||||
}
|
||||
|
||||
if config.zmq.timeout_ms == 0 {
|
||||
bail!("ZMQ timeout cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
// Validate collection interval
|
||||
if config.collection_interval_seconds == 0 {
|
||||
bail!("Collection interval cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
// Validate CPU thresholds
|
||||
if config.collectors.cpu.enabled {
|
||||
if config.collectors.cpu.load_warning_threshold <= 0.0 {
|
||||
bail!("CPU load warning threshold must be positive");
|
||||
}
|
||||
|
||||
if config.collectors.cpu.load_critical_threshold <= config.collectors.cpu.load_warning_threshold {
|
||||
|
||||
if config.collectors.cpu.load_critical_threshold
|
||||
<= config.collectors.cpu.load_warning_threshold
|
||||
{
|
||||
bail!("CPU load critical threshold must be greater than warning threshold");
|
||||
}
|
||||
|
||||
|
||||
if config.collectors.cpu.temperature_warning_threshold <= 0.0 {
|
||||
bail!("CPU temperature warning threshold must be positive");
|
||||
}
|
||||
|
||||
if config.collectors.cpu.temperature_critical_threshold <= config.collectors.cpu.temperature_warning_threshold {
|
||||
|
||||
if config.collectors.cpu.temperature_critical_threshold
|
||||
<= config.collectors.cpu.temperature_warning_threshold
|
||||
{
|
||||
bail!("CPU temperature critical threshold must be greater than warning threshold");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate memory thresholds
|
||||
if config.collectors.memory.enabled {
|
||||
if config.collectors.memory.usage_warning_percent <= 0.0 || config.collectors.memory.usage_warning_percent > 100.0 {
|
||||
if config.collectors.memory.usage_warning_percent <= 0.0
|
||||
|| config.collectors.memory.usage_warning_percent > 100.0
|
||||
{
|
||||
bail!("Memory usage warning threshold must be between 0 and 100");
|
||||
}
|
||||
|
||||
if config.collectors.memory.usage_critical_percent <= config.collectors.memory.usage_warning_percent
|
||||
|| config.collectors.memory.usage_critical_percent > 100.0 {
|
||||
|
||||
if config.collectors.memory.usage_critical_percent
|
||||
<= config.collectors.memory.usage_warning_percent
|
||||
|| config.collectors.memory.usage_critical_percent > 100.0
|
||||
{
|
||||
bail!("Memory usage critical threshold must be between warning threshold and 100");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate disk thresholds
|
||||
if config.collectors.disk.enabled {
|
||||
if config.collectors.disk.usage_warning_percent <= 0.0 || config.collectors.disk.usage_warning_percent > 100.0 {
|
||||
if config.collectors.disk.usage_warning_percent <= 0.0
|
||||
|| config.collectors.disk.usage_warning_percent > 100.0
|
||||
{
|
||||
bail!("Disk usage warning threshold must be between 0 and 100");
|
||||
}
|
||||
|
||||
if config.collectors.disk.usage_critical_percent <= config.collectors.disk.usage_warning_percent
|
||||
|| config.collectors.disk.usage_critical_percent > 100.0 {
|
||||
|
||||
if config.collectors.disk.usage_critical_percent
|
||||
<= config.collectors.disk.usage_warning_percent
|
||||
|| config.collectors.disk.usage_critical_percent > 100.0
|
||||
{
|
||||
bail!("Disk usage critical threshold must be between warning threshold and 100");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate systemd configuration
|
||||
if config.collectors.systemd.enabled {
|
||||
if config.collectors.systemd.nginx_latency_critical_ms <= 0.0 {
|
||||
bail!("Nginx latency critical threshold must be positive");
|
||||
}
|
||||
}
|
||||
|
||||
// Validate SMTP configuration
|
||||
if config.notifications.enabled {
|
||||
if config.notifications.smtp_host.is_empty() {
|
||||
bail!("SMTP host cannot be empty when notifications are enabled");
|
||||
}
|
||||
|
||||
|
||||
if config.notifications.smtp_port == 0 {
|
||||
bail!("SMTP port cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
if config.notifications.from_email.is_empty() {
|
||||
bail!("From email cannot be empty when notifications are enabled");
|
||||
}
|
||||
|
||||
|
||||
if config.notifications.to_email.is_empty() {
|
||||
bail!("To email cannot be empty when notifications are enabled");
|
||||
}
|
||||
|
||||
|
||||
// Basic email validation
|
||||
if !config.notifications.from_email.contains('@') {
|
||||
bail!("From email must contain @ symbol");
|
||||
}
|
||||
|
||||
|
||||
if !config.notifications.to_email.contains('@') {
|
||||
bail!("To email must contain @ symbol");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate cache configuration
|
||||
if config.cache.enabled {
|
||||
if config.cache.default_ttl_seconds == 0 {
|
||||
bail!("Cache TTL cannot be 0");
|
||||
}
|
||||
|
||||
if config.cache.max_entries == 0 {
|
||||
bail!("Cache max entries cannot be 0");
|
||||
}
|
||||
if config.cache.persist_path.is_empty() {
|
||||
bail!("Cache persist path cannot be empty");
|
||||
}
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,62 +1,75 @@
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use tracing::{info, error};
|
||||
use tracing::{error, info};
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
mod agent;
|
||||
mod cache;
|
||||
mod config;
|
||||
mod communication;
|
||||
mod metrics;
|
||||
mod collectors;
|
||||
mod communication;
|
||||
mod config;
|
||||
mod notifications;
|
||||
mod utils;
|
||||
|
||||
use agent::Agent;
|
||||
|
||||
/// Get version showing cm-dashboard-agent package hash for easy deployment verification
|
||||
fn get_version() -> &'static str {
|
||||
// Get the path of the current executable
|
||||
let exe_path = std::env::current_exe().expect("Failed to get executable path");
|
||||
let exe_str = exe_path.to_string_lossy();
|
||||
|
||||
// Extract Nix store hash from path like /nix/store/HASH-cm-dashboard-v0.1.8/bin/cm-dashboard-agent
|
||||
let hash_part = exe_str.strip_prefix("/nix/store/").expect("Not a nix store path");
|
||||
let hash = hash_part.split('-').next().expect("Invalid nix store path format");
|
||||
assert!(hash.len() >= 8, "Hash too short");
|
||||
|
||||
// Return first 8 characters of nix store hash
|
||||
let short_hash = hash[..8].to_string();
|
||||
Box::leak(short_hash.into_boxed_str())
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "cm-dashboard-agent")]
|
||||
#[command(about = "CM Dashboard metrics agent with individual metric collection")]
|
||||
#[command(version)]
|
||||
#[command(version = get_version())]
|
||||
struct Cli {
|
||||
/// Increase logging verbosity (-v, -vv)
|
||||
#[arg(short, long, action = clap::ArgAction::Count)]
|
||||
verbose: u8,
|
||||
|
||||
/// Configuration file path
|
||||
|
||||
/// Configuration file path (required)
|
||||
#[arg(short, long)]
|
||||
config: Option<String>,
|
||||
config: String,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
|
||||
// Setup logging
|
||||
let log_level = match cli.verbose {
|
||||
0 => "info",
|
||||
1 => "debug",
|
||||
1 => "debug",
|
||||
_ => "trace",
|
||||
};
|
||||
|
||||
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(EnvFilter::from_default_env().add_directive(log_level.parse()?))
|
||||
.init();
|
||||
|
||||
|
||||
info!("CM Dashboard Agent starting with individual metrics architecture...");
|
||||
|
||||
|
||||
// Create and run agent
|
||||
let mut agent = Agent::new(cli.config).await?;
|
||||
|
||||
let mut agent = Agent::new(Some(cli.config)).await?;
|
||||
|
||||
// Setup graceful shutdown channel
|
||||
let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
|
||||
let ctrl_c = async {
|
||||
tokio::signal::ctrl_c()
|
||||
.await
|
||||
.expect("failed to install Ctrl+C handler");
|
||||
};
|
||||
|
||||
|
||||
// Run agent with graceful shutdown
|
||||
tokio::select! {
|
||||
result = agent.run(shutdown_rx) => {
|
||||
@@ -72,7 +85,7 @@ async fn main() -> Result<()> {
|
||||
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
info!("Agent shutdown complete");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,26 +1,32 @@
|
||||
use anyhow::Result;
|
||||
use cm_dashboard_shared::Metric;
|
||||
use std::collections::HashMap;
|
||||
use std::time::Instant;
|
||||
use cm_dashboard_shared::{Metric, StatusTracker};
|
||||
use std::time::{Duration, Instant};
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
use crate::cache::MetricCacheManager;
|
||||
use crate::collectors::{
|
||||
backup::BackupCollector, cpu::CpuCollector, disk::DiskCollector, memory::MemoryCollector,
|
||||
systemd::SystemdCollector, Collector,
|
||||
nixos::NixOSCollector, systemd::SystemdCollector, Collector,
|
||||
};
|
||||
use crate::config::{AgentConfig, CollectorConfig};
|
||||
|
||||
/// Manages all metric collectors with intelligent caching
|
||||
/// Collector with timing information
|
||||
struct TimedCollector {
|
||||
collector: Box<dyn Collector>,
|
||||
interval: Duration,
|
||||
last_collection: Option<Instant>,
|
||||
name: String,
|
||||
}
|
||||
|
||||
/// Manages all metric collectors with individual intervals
|
||||
pub struct MetricCollectionManager {
|
||||
collectors: Vec<Box<dyn Collector>>,
|
||||
cache_manager: MetricCacheManager,
|
||||
last_collection_times: HashMap<String, Instant>,
|
||||
collectors: Vec<TimedCollector>,
|
||||
status_tracker: StatusTracker,
|
||||
cached_metrics: Vec<Metric>,
|
||||
}
|
||||
|
||||
impl MetricCollectionManager {
|
||||
pub async fn new(config: &CollectorConfig, agent_config: &AgentConfig) -> Result<Self> {
|
||||
let mut collectors: Vec<Box<dyn Collector>> = Vec::new();
|
||||
pub async fn new(config: &CollectorConfig, _agent_config: &AgentConfig) -> Result<Self> {
|
||||
let mut collectors: Vec<TimedCollector> = Vec::new();
|
||||
|
||||
// Benchmark mode - only enable specific collector based on env var
|
||||
let benchmark_mode = std::env::var("BENCHMARK_COLLECTOR").ok();
|
||||
@@ -30,7 +36,12 @@ impl MetricCollectionManager {
|
||||
// CPU collector only
|
||||
if config.cpu.enabled {
|
||||
let cpu_collector = CpuCollector::new(config.cpu.clone());
|
||||
collectors.push(Box::new(cpu_collector));
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(cpu_collector),
|
||||
interval: Duration::from_secs(config.cpu.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "CPU".to_string(),
|
||||
});
|
||||
info!("BENCHMARK: CPU collector only");
|
||||
}
|
||||
}
|
||||
@@ -38,20 +49,35 @@ impl MetricCollectionManager {
|
||||
// Memory collector only
|
||||
if config.memory.enabled {
|
||||
let memory_collector = MemoryCollector::new(config.memory.clone());
|
||||
collectors.push(Box::new(memory_collector));
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(memory_collector),
|
||||
interval: Duration::from_secs(config.memory.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Memory".to_string(),
|
||||
});
|
||||
info!("BENCHMARK: Memory collector only");
|
||||
}
|
||||
}
|
||||
Some("disk") => {
|
||||
// Disk collector only
|
||||
let disk_collector = DiskCollector::new(config.disk.clone());
|
||||
collectors.push(Box::new(disk_collector));
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(disk_collector),
|
||||
interval: Duration::from_secs(config.disk.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Disk".to_string(),
|
||||
});
|
||||
info!("BENCHMARK: Disk collector only");
|
||||
}
|
||||
Some("systemd") => {
|
||||
// Systemd collector only
|
||||
let systemd_collector = SystemdCollector::new();
|
||||
collectors.push(Box::new(systemd_collector));
|
||||
let systemd_collector = SystemdCollector::new(config.systemd.clone());
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(systemd_collector),
|
||||
interval: Duration::from_secs(config.systemd.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Systemd".to_string(),
|
||||
});
|
||||
info!("BENCHMARK: Systemd collector only");
|
||||
}
|
||||
Some("backup") => {
|
||||
@@ -61,7 +87,12 @@ impl MetricCollectionManager {
|
||||
config.backup.backup_paths.first().cloned(),
|
||||
config.backup.max_age_hours,
|
||||
);
|
||||
collectors.push(Box::new(backup_collector));
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(backup_collector),
|
||||
interval: Duration::from_secs(config.backup.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Backup".to_string(),
|
||||
});
|
||||
info!("BENCHMARK: Backup collector only");
|
||||
}
|
||||
}
|
||||
@@ -73,50 +104,81 @@ impl MetricCollectionManager {
|
||||
// Normal mode - all collectors
|
||||
if config.cpu.enabled {
|
||||
let cpu_collector = CpuCollector::new(config.cpu.clone());
|
||||
collectors.push(Box::new(cpu_collector));
|
||||
info!("CPU collector initialized");
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(cpu_collector),
|
||||
interval: Duration::from_secs(config.cpu.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "CPU".to_string(),
|
||||
});
|
||||
info!("CPU collector initialized with {}s interval", config.cpu.interval_seconds);
|
||||
}
|
||||
|
||||
if config.memory.enabled {
|
||||
let memory_collector = MemoryCollector::new(config.memory.clone());
|
||||
collectors.push(Box::new(memory_collector));
|
||||
info!("Memory collector initialized");
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(memory_collector),
|
||||
interval: Duration::from_secs(config.memory.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Memory".to_string(),
|
||||
});
|
||||
info!("Memory collector initialized with {}s interval", config.memory.interval_seconds);
|
||||
}
|
||||
|
||||
let disk_collector = DiskCollector::new(config.disk.clone());
|
||||
collectors.push(Box::new(disk_collector));
|
||||
info!("Disk collector initialized");
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(disk_collector),
|
||||
interval: Duration::from_secs(config.disk.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Disk".to_string(),
|
||||
});
|
||||
info!("Disk collector initialized with {}s interval", config.disk.interval_seconds);
|
||||
|
||||
let systemd_collector = SystemdCollector::new();
|
||||
collectors.push(Box::new(systemd_collector));
|
||||
info!("Systemd collector initialized");
|
||||
let systemd_collector = SystemdCollector::new(config.systemd.clone());
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(systemd_collector),
|
||||
interval: Duration::from_secs(config.systemd.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Systemd".to_string(),
|
||||
});
|
||||
info!("Systemd collector initialized with {}s interval", config.systemd.interval_seconds);
|
||||
|
||||
if config.backup.enabled {
|
||||
let backup_collector = BackupCollector::new(
|
||||
config.backup.backup_paths.first().cloned(),
|
||||
config.backup.max_age_hours,
|
||||
);
|
||||
collectors.push(Box::new(backup_collector));
|
||||
info!("Backup collector initialized");
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(backup_collector),
|
||||
interval: Duration::from_secs(config.backup.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "Backup".to_string(),
|
||||
});
|
||||
info!("Backup collector initialized with {}s interval", config.backup.interval_seconds);
|
||||
}
|
||||
|
||||
if config.nixos.enabled {
|
||||
let nixos_collector = NixOSCollector::new(config.nixos.clone());
|
||||
collectors.push(TimedCollector {
|
||||
collector: Box::new(nixos_collector),
|
||||
interval: Duration::from_secs(config.nixos.interval_seconds),
|
||||
last_collection: None,
|
||||
name: "NixOS".to_string(),
|
||||
});
|
||||
info!("NixOS collector initialized with {}s interval", config.nixos.interval_seconds);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize cache manager with configuration
|
||||
let cache_manager = MetricCacheManager::new(agent_config.cache.clone());
|
||||
|
||||
// Start background cache tasks
|
||||
cache_manager.start_background_tasks().await;
|
||||
|
||||
info!(
|
||||
"Metric collection manager initialized with {} collectors and caching enabled",
|
||||
"Metric collection manager initialized with {} collectors",
|
||||
collectors.len()
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
collectors,
|
||||
cache_manager,
|
||||
last_collection_times: HashMap::new(),
|
||||
status_tracker: StatusTracker::new(),
|
||||
cached_metrics: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -125,154 +187,80 @@ impl MetricCollectionManager {
|
||||
let mut all_metrics = Vec::new();
|
||||
let now = Instant::now();
|
||||
|
||||
info!(
|
||||
"Force collecting from ALL {} collectors for startup",
|
||||
self.collectors.len()
|
||||
);
|
||||
|
||||
// Force collection from every collector regardless of intervals
|
||||
for collector in &self.collectors {
|
||||
let collector_name = collector.name();
|
||||
|
||||
match collector.collect().await {
|
||||
for timed_collector in &mut self.collectors {
|
||||
match timed_collector.collector.collect(&mut self.status_tracker).await {
|
||||
Ok(metrics) => {
|
||||
info!(
|
||||
"Force collected {} metrics from {} collector",
|
||||
metrics.len(),
|
||||
collector_name
|
||||
);
|
||||
|
||||
// Cache all new metrics
|
||||
for metric in &metrics {
|
||||
self.cache_manager.cache_metric(metric.clone()).await;
|
||||
}
|
||||
|
||||
let metric_count = metrics.len();
|
||||
all_metrics.extend(metrics);
|
||||
self.last_collection_times
|
||||
.insert(collector_name.to_string(), now);
|
||||
timed_collector.last_collection = Some(now);
|
||||
debug!("Force collected {} metrics from {}", metric_count, timed_collector.name);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Collector '{}' failed during force collection: {}",
|
||||
collector_name, e
|
||||
);
|
||||
// Continue with other collectors even if one fails
|
||||
error!("Collector {} failed: {}", timed_collector.name, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"Force collection completed: {} total metrics cached",
|
||||
all_metrics.len()
|
||||
);
|
||||
|
||||
// Cache the collected metrics
|
||||
self.cached_metrics = all_metrics.clone();
|
||||
Ok(all_metrics)
|
||||
}
|
||||
|
||||
/// Collect metrics from all collectors with intelligent caching
|
||||
pub async fn collect_all_metrics(&mut self) -> Result<Vec<Metric>> {
|
||||
/// Collect metrics from collectors whose intervals have elapsed
|
||||
pub async fn collect_metrics_timed(&mut self) -> Result<Vec<Metric>> {
|
||||
let mut all_metrics = Vec::new();
|
||||
let now = Instant::now();
|
||||
|
||||
// Collecting metrics from collectors (debug logging disabled for performance)
|
||||
|
||||
// Keep track of which collector types we're collecting fresh data from
|
||||
let mut collecting_fresh = std::collections::HashSet::new();
|
||||
|
||||
// For each collector, check if we need to collect based on time intervals
|
||||
for collector in &self.collectors {
|
||||
let collector_name = collector.name();
|
||||
|
||||
// Determine cache interval for this collector type - ALL REALTIME FOR FAST UPDATES
|
||||
let cache_interval_secs = match collector_name {
|
||||
"cpu" | "memory" | "disk" | "systemd" => 2, // All realtime for fast updates
|
||||
"backup" => 10, // Backup metrics every 10 seconds for testing
|
||||
_ => 2, // All realtime for fast updates
|
||||
for timed_collector in &mut self.collectors {
|
||||
let should_collect = match timed_collector.last_collection {
|
||||
None => true, // First collection
|
||||
Some(last_time) => now.duration_since(last_time) >= timed_collector.interval,
|
||||
};
|
||||
|
||||
let should_collect =
|
||||
if let Some(last_time) = self.last_collection_times.get(collector_name) {
|
||||
now.duration_since(*last_time).as_secs() >= cache_interval_secs
|
||||
} else {
|
||||
true // First collection
|
||||
};
|
||||
|
||||
if should_collect {
|
||||
collecting_fresh.insert(collector_name.to_string());
|
||||
match collector.collect().await {
|
||||
match timed_collector.collector.collect(&mut self.status_tracker).await {
|
||||
Ok(metrics) => {
|
||||
// Collector returned fresh metrics (debug logging disabled for performance)
|
||||
|
||||
// Cache all new metrics
|
||||
for metric in &metrics {
|
||||
self.cache_manager.cache_metric(metric.clone()).await;
|
||||
}
|
||||
|
||||
let metric_count = metrics.len();
|
||||
all_metrics.extend(metrics);
|
||||
self.last_collection_times
|
||||
.insert(collector_name.to_string(), now);
|
||||
timed_collector.last_collection = Some(now);
|
||||
debug!(
|
||||
"Collected {} metrics from {} ({}s interval)",
|
||||
metric_count,
|
||||
timed_collector.name,
|
||||
timed_collector.interval.as_secs()
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Collector '{}' failed: {}", collector_name, e);
|
||||
// Continue with other collectors even if one fails
|
||||
error!("Collector {} failed: {}", timed_collector.name, e);
|
||||
// Update last_collection time even on failure to prevent immediate retries
|
||||
timed_collector.last_collection = Some(now);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let _elapsed = self
|
||||
.last_collection_times
|
||||
.get(collector_name)
|
||||
.map(|t| now.duration_since(*t).as_secs())
|
||||
.unwrap_or(0);
|
||||
// Collector skipped (debug logging disabled for performance)
|
||||
}
|
||||
}
|
||||
|
||||
// For 2-second intervals, skip cached metrics to avoid duplicates
|
||||
// (Cache system disabled for realtime updates)
|
||||
|
||||
// Collected metrics total (debug logging disabled for performance)
|
||||
|
||||
// Update cache with newly collected metrics
|
||||
if !all_metrics.is_empty() {
|
||||
// Merge new metrics with cached metrics (replace by name)
|
||||
for new_metric in &all_metrics {
|
||||
// Remove any existing metric with the same name
|
||||
self.cached_metrics.retain(|cached| cached.name != new_metric.name);
|
||||
// Add the new metric
|
||||
self.cached_metrics.push(new_metric.clone());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(all_metrics)
|
||||
}
|
||||
|
||||
/// Get names of all registered collectors
|
||||
pub fn get_collector_names(&self) -> Vec<String> {
|
||||
self.collectors
|
||||
.iter()
|
||||
.map(|c| c.name().to_string())
|
||||
.collect()
|
||||
/// Collect metrics from all collectors (legacy method for compatibility)
|
||||
pub async fn collect_all_metrics(&mut self) -> Result<Vec<Metric>> {
|
||||
self.collect_metrics_timed().await
|
||||
}
|
||||
|
||||
/// Get cached metrics without triggering fresh collection
|
||||
pub fn get_cached_metrics(&self) -> Vec<Metric> {
|
||||
self.cached_metrics.clone()
|
||||
}
|
||||
|
||||
/// Get collector statistics
|
||||
pub fn get_stats(&self) -> HashMap<String, bool> {
|
||||
self.collectors
|
||||
.iter()
|
||||
.map(|c| (c.name().to_string(), true)) // All collectors are enabled
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Get all cached metrics from the cache manager
|
||||
pub async fn get_all_cached_metrics(&self) -> Result<Vec<Metric>> {
|
||||
let cached_metrics = self.cache_manager.get_all_cached_metrics().await;
|
||||
debug!(
|
||||
"Retrieved {} cached metrics for broadcast",
|
||||
cached_metrics.len()
|
||||
);
|
||||
Ok(cached_metrics)
|
||||
}
|
||||
|
||||
/// Determine which collector handles a specific metric
|
||||
fn get_collector_for_metric(&self, metric_name: &str) -> String {
|
||||
if metric_name.starts_with("cpu_") {
|
||||
"cpu".to_string()
|
||||
} else if metric_name.starts_with("memory_") {
|
||||
"memory".to_string()
|
||||
} else if metric_name.starts_with("disk_") {
|
||||
"disk".to_string()
|
||||
} else if metric_name.starts_with("service_") {
|
||||
"systemd".to_string()
|
||||
} else if metric_name.starts_with("backup_") {
|
||||
"backup".to_string()
|
||||
} else {
|
||||
"unknown".to_string()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,147 +1,64 @@
|
||||
use cm_dashboard_shared::Status;
|
||||
use std::collections::HashMap;
|
||||
use std::time::Instant;
|
||||
use tracing::{info, debug};
|
||||
|
||||
use crate::config::NotificationConfig;
|
||||
use anyhow::Result;
|
||||
use chrono::Utc;
|
||||
use lettre::transport::smtp::SmtpTransport;
|
||||
use lettre::{Message, Transport};
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
/// Manages status change tracking and notifications
|
||||
/// Manages notifications
|
||||
pub struct NotificationManager {
|
||||
config: NotificationConfig,
|
||||
hostname: String,
|
||||
metric_statuses: HashMap<String, Status>,
|
||||
last_notification_times: HashMap<String, Instant>,
|
||||
}
|
||||
|
||||
/// Status change information
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct StatusChange {
|
||||
pub metric_name: String,
|
||||
pub old_status: Status,
|
||||
pub new_status: Status,
|
||||
pub timestamp: Instant,
|
||||
}
|
||||
|
||||
impl NotificationManager {
|
||||
pub fn new(config: &NotificationConfig, hostname: &str) -> Result<Self, anyhow::Error> {
|
||||
info!("Initializing notification manager for {}", hostname);
|
||||
|
||||
pub fn new(config: &NotificationConfig, _hostname: &str) -> Result<Self> {
|
||||
Ok(Self {
|
||||
config: config.clone(),
|
||||
hostname: hostname.to_string(),
|
||||
metric_statuses: HashMap::new(),
|
||||
last_notification_times: HashMap::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Update metric status and return status change if any
|
||||
pub fn update_metric_status(&mut self, metric_name: &str, new_status: Status) -> Option<StatusChange> {
|
||||
let old_status = self.metric_statuses.get(metric_name).copied().unwrap_or(Status::Unknown);
|
||||
|
||||
// Update stored status
|
||||
self.metric_statuses.insert(metric_name.to_string(), new_status);
|
||||
|
||||
// Check if status actually changed
|
||||
if old_status != new_status {
|
||||
debug!("Status change detected for {}: {:?} -> {:?}", metric_name, old_status, new_status);
|
||||
|
||||
Some(StatusChange {
|
||||
metric_name: metric_name.to_string(),
|
||||
old_status,
|
||||
new_status,
|
||||
timestamp: Instant::now(),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Send notification for status change (placeholder implementation)
|
||||
pub async fn send_status_change_notification(
|
||||
&mut self,
|
||||
status_change: StatusChange,
|
||||
metric: &cm_dashboard_shared::Metric,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
|
||||
pub async fn send_direct_email(&mut self, subject: &str, body: &str) -> Result<()> {
|
||||
if !self.config.enabled {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Check rate limiting
|
||||
if self.is_rate_limited(&status_change.metric_name) {
|
||||
debug!("Notification rate limited for {}", status_change.metric_name);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Check maintenance mode
|
||||
|
||||
if self.is_maintenance_mode() {
|
||||
debug!("Maintenance mode active, suppressing notification for {}", status_change.metric_name);
|
||||
debug!("Maintenance mode active, suppressing email notification");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let hostname = gethostname::gethostname()
|
||||
.to_string_lossy()
|
||||
.to_string();
|
||||
|
||||
let from_email = self.config.from_email.replace("{hostname}", &hostname);
|
||||
|
||||
info!("Would send notification for {}: {:?} -> {:?}",
|
||||
status_change.metric_name, status_change.old_status, status_change.new_status);
|
||||
|
||||
// TODO: Implement actual email sending using lettre
|
||||
// For now, just log the notification
|
||||
self.log_notification(&status_change, metric);
|
||||
|
||||
// Update last notification time
|
||||
self.last_notification_times.insert(
|
||||
status_change.metric_name.clone(),
|
||||
status_change.timestamp
|
||||
let email_body = format!(
|
||||
"{}\n\n--\nCM Dashboard Agent\nGenerated at {}",
|
||||
body,
|
||||
Utc::now().format("%Y-%m-%d %H:%M:%S %Z")
|
||||
);
|
||||
|
||||
|
||||
let email = Message::builder()
|
||||
.from(from_email.parse()?)
|
||||
.to(self.config.to_email.parse()?)
|
||||
.subject(subject)
|
||||
.body(email_body)?;
|
||||
|
||||
let mailer = SmtpTransport::unencrypted_localhost();
|
||||
|
||||
match mailer.send(&email) {
|
||||
Ok(_) => info!("Direct email sent successfully: {}", subject),
|
||||
Err(e) => {
|
||||
error!("Failed to send email: {}", e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check if maintenance mode is active
|
||||
|
||||
fn is_maintenance_mode(&self) -> bool {
|
||||
std::fs::metadata("/tmp/cm-maintenance").is_ok()
|
||||
}
|
||||
|
||||
/// Check if notification is rate limited
|
||||
fn is_rate_limited(&self, metric_name: &str) -> bool {
|
||||
if self.config.rate_limit_minutes == 0 {
|
||||
return false; // No rate limiting
|
||||
}
|
||||
|
||||
if let Some(last_time) = self.last_notification_times.get(metric_name) {
|
||||
let elapsed = last_time.elapsed();
|
||||
let rate_limit_duration = std::time::Duration::from_secs(self.config.rate_limit_minutes * 60);
|
||||
|
||||
elapsed < rate_limit_duration
|
||||
} else {
|
||||
false // No previous notification
|
||||
}
|
||||
}
|
||||
|
||||
/// Log notification details
|
||||
fn log_notification(&self, status_change: &StatusChange, metric: &cm_dashboard_shared::Metric) {
|
||||
let status_description = match status_change.new_status {
|
||||
Status::Ok => "recovered",
|
||||
Status::Warning => "warning",
|
||||
Status::Critical => "critical",
|
||||
Status::Unknown => "unknown",
|
||||
};
|
||||
|
||||
info!(
|
||||
"NOTIFICATION: {} on {}: {} is {} (value: {})",
|
||||
status_description,
|
||||
self.hostname,
|
||||
status_change.metric_name,
|
||||
status_description,
|
||||
metric.value.as_string()
|
||||
);
|
||||
}
|
||||
|
||||
/// Process any pending notifications (placeholder)
|
||||
pub async fn process_pending(&mut self) {
|
||||
// Placeholder for batch notification processing
|
||||
// Could be used for email queue processing, etc.
|
||||
}
|
||||
|
||||
/// Get current metric statuses
|
||||
pub fn get_metric_statuses(&self) -> &HashMap<String, Status> {
|
||||
&self.metric_statuses
|
||||
std::fs::metadata(&self.config.maintenance_mode_file).is_ok()
|
||||
}
|
||||
}
|
||||
422
agent/src/status/mod.rs
Normal file
422
agent/src/status/mod.rs
Normal file
@@ -0,0 +1,422 @@
|
||||
use cm_dashboard_shared::{Status, Metric};
|
||||
use std::collections::HashMap;
|
||||
use std::time::Instant;
|
||||
use tracing::{debug, info, error};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use chrono::Utc;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct HostStatusConfig {
|
||||
pub enabled: bool,
|
||||
pub aggregation_method: String, // "worst_case"
|
||||
}
|
||||
|
||||
impl Default for HostStatusConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: true,
|
||||
aggregation_method: "worst_case".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct StatusChangeSummary {
|
||||
pub service_name: String,
|
||||
pub initial_status: Status,
|
||||
pub final_status: Status,
|
||||
pub change_count: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct AggregatedStatusChanges {
|
||||
pub start_time: Instant,
|
||||
pub end_time: Instant,
|
||||
pub service_summaries: Vec<StatusChangeSummary>,
|
||||
pub host_status_initial: Status,
|
||||
pub host_status_final: Status,
|
||||
pub requires_notification: bool,
|
||||
}
|
||||
|
||||
pub struct HostStatusManager {
|
||||
service_statuses: HashMap<String, Status>,
|
||||
current_host_status: Status,
|
||||
previous_host_status: Status,
|
||||
last_status_change: Option<Instant>,
|
||||
config: HostStatusConfig,
|
||||
// Notification batching
|
||||
pending_changes: HashMap<String, (Status, Status, usize)>, // service -> (initial_status, current_status, change_count)
|
||||
batch_start_time: Option<Instant>,
|
||||
batch_start_host_status: Status,
|
||||
}
|
||||
|
||||
impl HostStatusManager {
|
||||
pub fn new(config: HostStatusConfig) -> Self {
|
||||
info!("Initializing HostStatusManager with config: {:?}", config);
|
||||
Self {
|
||||
service_statuses: HashMap::new(),
|
||||
current_host_status: Status::Unknown,
|
||||
previous_host_status: Status::Unknown,
|
||||
last_status_change: None,
|
||||
config,
|
||||
pending_changes: HashMap::new(),
|
||||
batch_start_time: None,
|
||||
batch_start_host_status: Status::Unknown,
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the status of a specific service and recalculate host status
|
||||
/// Updates real-time status and buffers changes for email notifications
|
||||
pub fn update_service_status(&mut self, service: String, status: Status) {
|
||||
if !self.config.enabled {
|
||||
return;
|
||||
}
|
||||
|
||||
let old_service_status = self.service_statuses.get(&service).copied().unwrap_or(Status::Unknown);
|
||||
|
||||
// Only proceed if status actually changed
|
||||
if old_service_status == status {
|
||||
return;
|
||||
}
|
||||
|
||||
// Initialize batch if this is the first change
|
||||
if self.batch_start_time.is_none() {
|
||||
self.batch_start_time = Some(Instant::now());
|
||||
self.batch_start_host_status = self.current_host_status;
|
||||
debug!("Starting notification batch");
|
||||
}
|
||||
|
||||
// Update real-time service status (for dashboard)
|
||||
self.service_statuses.insert(service.clone(), status);
|
||||
|
||||
// Buffer change for email notifications
|
||||
match self.pending_changes.entry(service.clone()) {
|
||||
std::collections::hash_map::Entry::Occupied(mut entry) => {
|
||||
// Service already has changes in this batch - update final status and increment count
|
||||
let (initial_status, _current_status, change_count) = entry.get();
|
||||
entry.insert((*initial_status, status, change_count + 1));
|
||||
}
|
||||
std::collections::hash_map::Entry::Vacant(entry) => {
|
||||
// First change for this service in this batch
|
||||
entry.insert((old_service_status, status, 1));
|
||||
}
|
||||
}
|
||||
|
||||
// Recalculate host status
|
||||
let old_host_status = self.current_host_status;
|
||||
self.previous_host_status = old_host_status;
|
||||
self.current_host_status = self.calculate_host_status();
|
||||
|
||||
if old_host_status != self.current_host_status {
|
||||
self.last_status_change = Some(Instant::now());
|
||||
info!(
|
||||
"Host status changed: {:?} -> {:?} (triggered by service '{}': {:?} -> {:?})",
|
||||
old_host_status, self.current_host_status, service, old_service_status, status
|
||||
);
|
||||
}
|
||||
|
||||
debug!(
|
||||
"Service status updated: {} {:?} -> {:?}, host status: {:?}, pending notifications: {}",
|
||||
service, old_service_status, status, self.current_host_status, self.pending_changes.len()
|
||||
);
|
||||
}
|
||||
|
||||
/// Get the current host status as a metric for broadcasting to dashboard
|
||||
pub fn get_host_status_metric(&self) -> Metric {
|
||||
Metric {
|
||||
name: "host_status_summary".to_string(),
|
||||
value: cm_dashboard_shared::MetricValue::String(format!(
|
||||
"Host aggregated from {} services",
|
||||
self.service_statuses.len()
|
||||
)),
|
||||
status: self.current_host_status,
|
||||
timestamp: Utc::now().timestamp() as u64,
|
||||
description: Some("Aggregated host status from all services".to_string()),
|
||||
unit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate the overall host status based on all service statuses
|
||||
fn calculate_host_status(&self) -> Status {
|
||||
if self.service_statuses.is_empty() {
|
||||
return Status::Unknown;
|
||||
}
|
||||
|
||||
match self.config.aggregation_method.as_str() {
|
||||
"worst_case" => {
|
||||
let statuses: Vec<Status> = self.service_statuses.values().copied().collect();
|
||||
Status::aggregate(&statuses)
|
||||
},
|
||||
_ => {
|
||||
debug!("Unknown aggregation method: {}, falling back to worst_case", self.config.aggregation_method);
|
||||
let statuses: Vec<Status> = self.service_statuses.values().copied().collect();
|
||||
Status::aggregate(&statuses)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Process a metric - updates status and queues for aggregated notifications if status changed
|
||||
pub async fn process_metric(&mut self, metric: &Metric, _notification_manager: &mut crate::notifications::NotificationManager) -> bool {
|
||||
let old_service_status = self.service_statuses.get(&metric.name).copied();
|
||||
let old_host_status = self.current_host_status;
|
||||
let new_service_status = metric.status;
|
||||
|
||||
// Update status (this recalculates host status internally)
|
||||
self.update_service_status(metric.name.clone(), new_service_status);
|
||||
|
||||
let new_host_status = self.current_host_status;
|
||||
let mut status_changed = false;
|
||||
|
||||
// Check if service status actually changed (ignore first-time status setting)
|
||||
if let Some(old_service_status) = old_service_status {
|
||||
if old_service_status != new_service_status {
|
||||
debug!("Service status change detected for {}: {:?} -> {:?}", metric.name, old_service_status, new_service_status);
|
||||
|
||||
// Queue change for aggregated notification (not immediate)
|
||||
self.queue_status_change(&metric.name, old_service_status, new_service_status);
|
||||
|
||||
status_changed = true;
|
||||
}
|
||||
} else {
|
||||
debug!("Initial status set for {}: {:?}", metric.name, new_service_status);
|
||||
}
|
||||
|
||||
// Check if host status changed (this should trigger immediate transmission)
|
||||
if old_host_status != new_host_status {
|
||||
debug!("Host status change detected: {:?} -> {:?}", old_host_status, new_host_status);
|
||||
status_changed = true;
|
||||
}
|
||||
|
||||
status_changed // Return true if either service or host status changed
|
||||
}
|
||||
|
||||
/// Queue status change for aggregated notification
|
||||
fn queue_status_change(&mut self, metric_name: &str, old_status: Status, new_status: Status) {
|
||||
// Add to pending changes for aggregated notification
|
||||
let entry = self.pending_changes.entry(metric_name.to_string()).or_insert((old_status, old_status, 0));
|
||||
entry.1 = new_status; // Update final status
|
||||
entry.2 += 1; // Increment change count
|
||||
|
||||
// Set batch start time if this is the first change
|
||||
if self.batch_start_time.is_none() {
|
||||
self.batch_start_time = Some(Instant::now());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Process pending notifications - legacy method, now rarely used
|
||||
pub async fn process_pending_notifications(&mut self, notification_manager: &mut crate::notifications::NotificationManager) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||
if !self.config.enabled || self.pending_changes.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Process notifications immediately without interval batching
|
||||
|
||||
// Create aggregated status changes
|
||||
let aggregated = self.create_aggregated_changes();
|
||||
|
||||
if aggregated.requires_notification {
|
||||
info!("Sending aggregated notification for {} service changes", aggregated.service_summaries.len());
|
||||
|
||||
// Send aggregated notification
|
||||
if let Err(e) = self.send_aggregated_email(&aggregated, notification_manager).await {
|
||||
error!("Failed to send aggregated notification: {}", e);
|
||||
}
|
||||
} else {
|
||||
debug!("No significant changes requiring notification in batch of {} changes", self.pending_changes.len());
|
||||
}
|
||||
|
||||
// Clear the batch
|
||||
self.clear_notification_batch();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create aggregated status changes from pending buffer
|
||||
fn create_aggregated_changes(&self) -> AggregatedStatusChanges {
|
||||
let mut service_summaries = Vec::new();
|
||||
let mut requires_notification = false;
|
||||
|
||||
for (service_name, (initial_status, final_status, change_count)) in &self.pending_changes {
|
||||
let significant_change = self.is_significant_change(*initial_status, *final_status);
|
||||
if significant_change {
|
||||
requires_notification = true;
|
||||
}
|
||||
|
||||
service_summaries.push(StatusChangeSummary {
|
||||
service_name: service_name.clone(),
|
||||
initial_status: *initial_status,
|
||||
final_status: *final_status,
|
||||
change_count: *change_count,
|
||||
});
|
||||
}
|
||||
|
||||
// Also check if host status change is significant
|
||||
if self.is_significant_change(self.batch_start_host_status, self.current_host_status) {
|
||||
requires_notification = true;
|
||||
}
|
||||
|
||||
AggregatedStatusChanges {
|
||||
start_time: self.batch_start_time.unwrap_or_else(Instant::now),
|
||||
end_time: Instant::now(),
|
||||
service_summaries,
|
||||
host_status_initial: self.batch_start_host_status,
|
||||
host_status_final: self.current_host_status,
|
||||
requires_notification,
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if a status change is significant enough for notification
|
||||
fn is_significant_change(&self, old_status: Status, new_status: Status) -> bool {
|
||||
match (old_status, new_status) {
|
||||
// Don't notify on transitions from Unknown (startup/restart scenario)
|
||||
(Status::Unknown, _) => false,
|
||||
// Always notify on problems (but not from Unknown)
|
||||
(_, Status::Warning) | (_, Status::Critical) => true,
|
||||
// Only notify on recovery if it's from a problem state to OK and all services are OK
|
||||
(Status::Warning | Status::Critical, Status::Ok) => self.current_host_status == Status::Ok,
|
||||
// Don't notify on other transitions
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
async fn send_aggregated_email(
|
||||
&self,
|
||||
aggregated: &AggregatedStatusChanges,
|
||||
notification_manager: &mut crate::notifications::NotificationManager,
|
||||
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||
let mut summary_parts = Vec::new();
|
||||
let critical_count = aggregated.service_summaries.iter().filter(|s| s.final_status == Status::Critical).count();
|
||||
let warning_count = aggregated.service_summaries.iter().filter(|s| s.final_status == Status::Warning).count();
|
||||
let recovery_count = aggregated.service_summaries.iter().filter(|s|
|
||||
matches!((s.initial_status, s.final_status), (Status::Warning | Status::Critical, Status::Ok))
|
||||
).count();
|
||||
let startup_count = aggregated.service_summaries.iter().filter(|s|
|
||||
matches!((s.initial_status, s.final_status), (Status::Unknown, Status::Ok | Status::Pending))
|
||||
).count();
|
||||
|
||||
if critical_count > 0 { summary_parts.push(format!("{} critical", critical_count)); }
|
||||
if warning_count > 0 { summary_parts.push(format!("{} warning", warning_count)); }
|
||||
if recovery_count > 0 { summary_parts.push(format!("{} recovered", recovery_count)); }
|
||||
if startup_count > 0 { summary_parts.push(format!("{} started", startup_count)); }
|
||||
|
||||
let summary_text = if summary_parts.is_empty() {
|
||||
format!("{} service changes", aggregated.service_summaries.len())
|
||||
} else {
|
||||
summary_parts.join(", ")
|
||||
};
|
||||
|
||||
let subject = format!("Status Alert: {}", summary_text);
|
||||
let body = self.format_aggregated_details(aggregated);
|
||||
|
||||
notification_manager.send_direct_email(&subject, &body).await.map_err(|e| e.into())
|
||||
}
|
||||
|
||||
/// Format details for aggregated notification
|
||||
fn format_aggregated_details(&self, aggregated: &AggregatedStatusChanges) -> String {
|
||||
let mut details = String::new();
|
||||
|
||||
let duration = aggregated.end_time.duration_since(aggregated.start_time).as_secs();
|
||||
details.push_str(&format!(
|
||||
"Status Summary ({}s duration)\n",
|
||||
duration
|
||||
));
|
||||
|
||||
if aggregated.host_status_initial != aggregated.host_status_final {
|
||||
details.push_str(&format!(
|
||||
"Host Status: {:?} → {:?}\n\n",
|
||||
aggregated.host_status_initial,
|
||||
aggregated.host_status_final
|
||||
));
|
||||
}
|
||||
|
||||
// Group services by change type
|
||||
let mut critical_changes = Vec::new();
|
||||
let mut warning_changes = Vec::new();
|
||||
let mut recovery_changes = Vec::new();
|
||||
let mut startup_changes = Vec::new();
|
||||
let mut other_changes = Vec::new();
|
||||
|
||||
for summary in &aggregated.service_summaries {
|
||||
let change_info = format!(
|
||||
"{}: {:?} → {:?}{}",
|
||||
summary.service_name,
|
||||
summary.initial_status,
|
||||
summary.final_status,
|
||||
if summary.change_count > 1 { format!(" ({} changes)", summary.change_count) } else { String::new() }
|
||||
);
|
||||
|
||||
match (summary.initial_status, summary.final_status) {
|
||||
(_, Status::Critical) => critical_changes.push(change_info),
|
||||
(_, Status::Warning) => warning_changes.push(change_info),
|
||||
(Status::Warning | Status::Critical, Status::Ok) => recovery_changes.push(change_info),
|
||||
(Status::Unknown, Status::Ok | Status::Pending) => startup_changes.push(change_info),
|
||||
_ => other_changes.push(change_info),
|
||||
}
|
||||
}
|
||||
|
||||
// Show critical problems first
|
||||
if !critical_changes.is_empty() {
|
||||
details.push_str(&format!("🔴 CRITICAL ISSUES ({}):\n", critical_changes.len()));
|
||||
for change in critical_changes {
|
||||
details.push_str(&format!(" {}\n", change));
|
||||
}
|
||||
details.push('\n');
|
||||
}
|
||||
|
||||
// Show warnings
|
||||
if !warning_changes.is_empty() {
|
||||
details.push_str(&format!("🟡 WARNINGS ({}):\n", warning_changes.len()));
|
||||
for change in warning_changes {
|
||||
details.push_str(&format!(" {}\n", change));
|
||||
}
|
||||
details.push('\n');
|
||||
}
|
||||
|
||||
// Show recoveries only if host status is now OK (all services recovered)
|
||||
if !recovery_changes.is_empty() && aggregated.host_status_final == Status::Ok {
|
||||
details.push_str(&format!("✅ RECOVERIES ({}):\n", recovery_changes.len()));
|
||||
for change in recovery_changes {
|
||||
details.push_str(&format!(" {}\n", change));
|
||||
}
|
||||
details.push('\n');
|
||||
}
|
||||
|
||||
// Show startups (usually not important but good to know)
|
||||
if !startup_changes.is_empty() {
|
||||
details.push_str(&format!("🟢 SERVICE STARTUPS ({}):\n", startup_changes.len()));
|
||||
for change in startup_changes {
|
||||
details.push_str(&format!(" {}\n", change));
|
||||
}
|
||||
details.push('\n');
|
||||
}
|
||||
|
||||
// Show other changes
|
||||
if !other_changes.is_empty() {
|
||||
details.push_str(&format!("ℹ️ OTHER CHANGES ({}):\n", other_changes.len()));
|
||||
for change in other_changes {
|
||||
details.push_str(&format!(" {}\n", change));
|
||||
}
|
||||
}
|
||||
|
||||
details
|
||||
}
|
||||
|
||||
/// Clear the notification batch
|
||||
fn clear_notification_batch(&mut self) {
|
||||
self.pending_changes.clear();
|
||||
self.batch_start_time = None;
|
||||
self.batch_start_host_status = self.current_host_status;
|
||||
debug!("Cleared notification batch");
|
||||
}
|
||||
}
|
||||
|
||||
// Tests temporarily disabled due to API changes
|
||||
// The functionality works as tested manually
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
// Tests will be updated to match the new notification batching API
|
||||
}
|
||||
@@ -1,90 +0,0 @@
|
||||
// Utility functions for the agent
|
||||
|
||||
/// System information utilities
|
||||
pub mod system {
|
||||
use std::fs;
|
||||
|
||||
/// Get number of CPU cores efficiently
|
||||
pub fn get_cpu_count() -> Result<usize, std::io::Error> {
|
||||
// Try /proc/cpuinfo first (most reliable)
|
||||
if let Ok(content) = fs::read_to_string("/proc/cpuinfo") {
|
||||
let count = content.lines()
|
||||
.filter(|line| line.starts_with("processor"))
|
||||
.count();
|
||||
|
||||
if count > 0 {
|
||||
return Ok(count);
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to nproc equivalent
|
||||
match std::thread::available_parallelism() {
|
||||
Ok(count) => Ok(count.get()),
|
||||
Err(_) => Ok(1), // Default to 1 core if all else fails
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if running in container
|
||||
pub fn is_container() -> bool {
|
||||
// Check for common container indicators
|
||||
fs::metadata("/.dockerenv").is_ok() ||
|
||||
fs::read_to_string("/proc/1/cgroup")
|
||||
.map(|content| content.contains("docker") || content.contains("containerd"))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
}
|
||||
|
||||
/// Time utilities
|
||||
pub mod time {
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
/// Measure execution time of a closure
|
||||
pub fn measure_time<F, R>(f: F) -> (R, Duration)
|
||||
where
|
||||
F: FnOnce() -> R,
|
||||
{
|
||||
let start = Instant::now();
|
||||
let result = f();
|
||||
let duration = start.elapsed();
|
||||
(result, duration)
|
||||
}
|
||||
}
|
||||
|
||||
/// Performance monitoring utilities
|
||||
pub mod perf {
|
||||
use std::time::{Duration, Instant};
|
||||
use tracing::warn;
|
||||
|
||||
/// Performance monitor for critical operations
|
||||
pub struct PerfMonitor {
|
||||
operation: String,
|
||||
start: Instant,
|
||||
warning_threshold: Duration,
|
||||
}
|
||||
|
||||
impl PerfMonitor {
|
||||
pub fn new(operation: &str, warning_threshold: Duration) -> Self {
|
||||
Self {
|
||||
operation: operation.to_string(),
|
||||
start: Instant::now(),
|
||||
warning_threshold,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_ms(operation: &str, warning_threshold_ms: u64) -> Self {
|
||||
Self::new(operation, Duration::from_millis(warning_threshold_ms))
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PerfMonitor {
|
||||
fn drop(&mut self) {
|
||||
let elapsed = self.start.elapsed();
|
||||
if elapsed > self.warning_threshold {
|
||||
warn!(
|
||||
"Performance warning: {} took {:?} (threshold: {:?})",
|
||||
self.operation, elapsed, self.warning_threshold
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.0"
|
||||
version = "0.1.184"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
@@ -18,4 +18,5 @@ tracing-subscriber = { workspace = true }
|
||||
ratatui = { workspace = true }
|
||||
crossterm = { workspace = true }
|
||||
toml = { workspace = true }
|
||||
gethostname = { workspace = true }
|
||||
gethostname = { workspace = true }
|
||||
wake-on-lan = "0.2"
|
||||
@@ -1,43 +1,53 @@
|
||||
use anyhow::Result;
|
||||
use crossterm::{
|
||||
event::{self, Event, KeyCode},
|
||||
event::{self},
|
||||
execute,
|
||||
terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen},
|
||||
};
|
||||
use ratatui::{
|
||||
backend::CrosstermBackend,
|
||||
Terminal,
|
||||
};
|
||||
use ratatui::{backend::CrosstermBackend, Terminal};
|
||||
use std::io;
|
||||
use std::time::{Duration, Instant};
|
||||
use tracing::{info, error, debug, warn};
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use crate::communication::{ZmqConsumer};
|
||||
use crate::config::DashboardConfig;
|
||||
use crate::communication::{ZmqConsumer, ZmqCommandSender, AgentCommand};
|
||||
use crate::metrics::MetricStore;
|
||||
use crate::ui::TuiApp;
|
||||
|
||||
pub struct Dashboard {
|
||||
zmq_consumer: ZmqConsumer,
|
||||
zmq_command_sender: ZmqCommandSender,
|
||||
metric_store: MetricStore,
|
||||
tui_app: Option<TuiApp>,
|
||||
terminal: Option<Terminal<CrosstermBackend<io::Stdout>>>,
|
||||
headless: bool,
|
||||
initial_commands_sent: std::collections::HashSet<String>,
|
||||
config: DashboardConfig,
|
||||
}
|
||||
|
||||
impl Dashboard {
|
||||
pub async fn new(config_path: Option<String>, headless: bool) -> Result<Self> {
|
||||
info!("Initializing dashboard");
|
||||
|
||||
// Load configuration
|
||||
let config = if let Some(path) = config_path {
|
||||
DashboardConfig::load_from_file(&path)?
|
||||
} else {
|
||||
DashboardConfig::default()
|
||||
|
||||
// Load configuration - try default path if not specified
|
||||
let config = match config_path {
|
||||
Some(path) => DashboardConfig::load_from_file(&path)?,
|
||||
None => {
|
||||
// Try default NixOS config path
|
||||
let default_path = "/etc/cm-dashboard/dashboard.toml";
|
||||
match DashboardConfig::load_from_file(default_path) {
|
||||
Ok(config) => {
|
||||
info!("Using default config file: {}", default_path);
|
||||
config
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Configuration file is required. Use --config to specify path or ensure {} exists.", default_path);
|
||||
error!("Failed to load default config: {}", e);
|
||||
return Err(anyhow::anyhow!("Missing required configuration file"));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Initialize ZMQ consumer
|
||||
let mut zmq_consumer = match ZmqConsumer::new(&config.zmq).await {
|
||||
Ok(consumer) => consumer,
|
||||
@@ -46,54 +56,48 @@ impl Dashboard {
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
|
||||
// Initialize ZMQ command sender
|
||||
let zmq_command_sender = match ZmqCommandSender::new(&config.zmq) {
|
||||
Ok(sender) => sender,
|
||||
Err(e) => {
|
||||
error!("Failed to initialize ZMQ command sender: {}", e);
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
|
||||
// Connect to predefined hosts from configuration
|
||||
let hosts = config.hosts.predefined_hosts.clone();
|
||||
|
||||
|
||||
|
||||
// Try to connect to hosts but don't fail if none are available
|
||||
match zmq_consumer.connect_to_predefined_hosts(&hosts).await {
|
||||
match zmq_consumer.connect_to_predefined_hosts(&config.hosts).await {
|
||||
Ok(_) => info!("Successfully connected to ZMQ hosts"),
|
||||
Err(e) => {
|
||||
warn!("Failed to connect to hosts (this is normal if no agents are running): {}", e);
|
||||
warn!(
|
||||
"Failed to connect to hosts (this is normal if no agents are running): {}",
|
||||
e
|
||||
);
|
||||
info!("Dashboard will start anyway and connect when agents become available");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Initialize metric store
|
||||
let metric_store = MetricStore::new(10000, 24); // 10k metrics, 24h retention
|
||||
|
||||
|
||||
// Initialize TUI components only if not headless
|
||||
let (tui_app, terminal) = if headless {
|
||||
info!("Running in headless mode (no TUI)");
|
||||
(None, None)
|
||||
} else {
|
||||
// Initialize TUI app
|
||||
let tui_app = TuiApp::new();
|
||||
|
||||
let tui_app = TuiApp::new(config.clone());
|
||||
|
||||
// Setup terminal
|
||||
if let Err(e) = enable_raw_mode() {
|
||||
error!("Failed to enable raw mode: {}", e);
|
||||
error!("This usually means the dashboard is being run without a proper terminal (TTY)");
|
||||
error!(
|
||||
"This usually means the dashboard is being run without a proper terminal (TTY)"
|
||||
);
|
||||
error!("Try running with --headless flag or in a proper terminal");
|
||||
return Err(e.into());
|
||||
}
|
||||
|
||||
|
||||
let mut stdout = io::stdout();
|
||||
if let Err(e) = execute!(stdout, EnterAlternateScreen) {
|
||||
error!("Failed to enter alternate screen: {}", e);
|
||||
let _ = disable_raw_mode();
|
||||
return Err(e.into());
|
||||
}
|
||||
|
||||
|
||||
let backend = CrosstermBackend::new(stdout);
|
||||
let terminal = match Terminal::new(backend) {
|
||||
Ok(term) => term,
|
||||
@@ -103,83 +107,55 @@ impl Dashboard {
|
||||
return Err(e.into());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
(Some(tui_app), Some(terminal))
|
||||
};
|
||||
|
||||
|
||||
info!("Dashboard initialization complete");
|
||||
|
||||
|
||||
Ok(Self {
|
||||
zmq_consumer,
|
||||
zmq_command_sender,
|
||||
metric_store,
|
||||
tui_app,
|
||||
terminal,
|
||||
headless,
|
||||
initial_commands_sent: std::collections::HashSet::new(),
|
||||
config,
|
||||
})
|
||||
}
|
||||
|
||||
/// Send a command to a specific agent
|
||||
pub async fn send_command(&mut self, hostname: &str, command: AgentCommand) -> Result<()> {
|
||||
self.zmq_command_sender.send_command(hostname, command).await
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
pub async fn run(&mut self) -> Result<()> {
|
||||
info!("Starting dashboard main loop");
|
||||
|
||||
|
||||
let mut last_metrics_check = Instant::now();
|
||||
let metrics_check_interval = Duration::from_millis(100); // Check for metrics every 100ms
|
||||
|
||||
let mut last_heartbeat_check = Instant::now();
|
||||
let heartbeat_check_interval = Duration::from_secs(1); // Check for host connectivity every 1 second
|
||||
|
||||
loop {
|
||||
// Handle terminal events (keyboard input) only if not headless
|
||||
if !self.headless {
|
||||
match event::poll(Duration::from_millis(50)) {
|
||||
Ok(true) => {
|
||||
match event::read() {
|
||||
Ok(Event::Key(key)) => {
|
||||
match key.code {
|
||||
KeyCode::Char('q') => {
|
||||
info!("Quit key pressed, exiting dashboard");
|
||||
break;
|
||||
}
|
||||
KeyCode::Left => {
|
||||
debug!("Navigate left");
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
|
||||
error!("Error handling left navigation: {}", e);
|
||||
Ok(event) => {
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
// Handle input
|
||||
match tui_app.handle_input(event) {
|
||||
Ok(_) => {
|
||||
// Check if we should quit
|
||||
if tui_app.should_quit() {
|
||||
info!("Quit requested, exiting dashboard");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
KeyCode::Right => {
|
||||
debug!("Navigate right");
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
|
||||
error!("Error handling right navigation: {}", e);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Error handling input: {}", e);
|
||||
}
|
||||
}
|
||||
KeyCode::Char('r') => {
|
||||
debug!("Refresh requested");
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
|
||||
error!("Error handling refresh: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
KeyCode::Tab => {
|
||||
debug!("Tab pressed - next host");
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = tui_app.handle_input(Event::Key(key)) {
|
||||
error!("Error handling tab navigation: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok(_) => {} // Other events (mouse, resize, etc.)
|
||||
Err(e) => {
|
||||
error!("Error reading terminal event: {}", e);
|
||||
break;
|
||||
@@ -192,61 +168,108 @@ impl Dashboard {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check for new metrics
|
||||
if last_metrics_check.elapsed() >= metrics_check_interval {
|
||||
if let Ok(Some(metric_message)) = self.zmq_consumer.receive_metrics().await {
|
||||
debug!("Received metrics from {}: {} metrics",
|
||||
metric_message.hostname, metric_message.metrics.len());
|
||||
|
||||
// Check if this is the first time we've seen this host
|
||||
let is_new_host = !self.initial_commands_sent.contains(&metric_message.hostname);
|
||||
|
||||
if is_new_host {
|
||||
info!("First contact with host {}, sending initial CollectNow command", metric_message.hostname);
|
||||
|
||||
// Send CollectNow command for immediate refresh
|
||||
if let Err(e) = self.send_command(&metric_message.hostname, AgentCommand::CollectNow).await {
|
||||
error!("Failed to send initial CollectNow command to {}: {}", metric_message.hostname, e);
|
||||
} else {
|
||||
info!("✓ Sent initial CollectNow command to {}", metric_message.hostname);
|
||||
self.initial_commands_sent.insert(metric_message.hostname.clone());
|
||||
|
||||
// Render UI immediately after handling input for responsive feedback
|
||||
if let Some(ref mut terminal) = self.terminal {
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = terminal.draw(|frame| {
|
||||
tui_app.render(frame, &self.metric_store);
|
||||
}) {
|
||||
error!("Error rendering TUI after input: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
// Update metric store
|
||||
self.metric_store.update_metrics(&metric_message.hostname, metric_message.metrics);
|
||||
|
||||
// Update TUI with new hosts and metrics (only if not headless)
|
||||
}
|
||||
}
|
||||
|
||||
// Check for new metrics
|
||||
if last_metrics_check.elapsed() >= metrics_check_interval {
|
||||
if let Ok(Some(agent_data)) = self.zmq_consumer.receive_agent_data().await {
|
||||
debug!(
|
||||
"Received agent data from {}",
|
||||
agent_data.hostname
|
||||
);
|
||||
|
||||
// Track first contact with host (no command needed - agent sends data every 2s)
|
||||
let is_new_host = !self
|
||||
.initial_commands_sent
|
||||
.contains(&agent_data.hostname);
|
||||
|
||||
if is_new_host {
|
||||
info!(
|
||||
"First contact with host {} - data will update automatically",
|
||||
agent_data.hostname
|
||||
);
|
||||
self.initial_commands_sent
|
||||
.insert(agent_data.hostname.clone());
|
||||
}
|
||||
|
||||
// Store structured data directly
|
||||
self.metric_store.store_agent_data(agent_data);
|
||||
|
||||
// Check for agent version mismatches across hosts
|
||||
if let Some((current_version, outdated_hosts)) = self.metric_store.get_version_mismatches() {
|
||||
for outdated_host in &outdated_hosts {
|
||||
warn!("Host {} has outdated agent version (current: {})", outdated_host, current_version);
|
||||
}
|
||||
}
|
||||
|
||||
// Update TUI with new metrics (only if not headless)
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
let connected_hosts = self.metric_store.get_connected_hosts(Duration::from_secs(30));
|
||||
tui_app.update_hosts(connected_hosts);
|
||||
tui_app.update_metrics(&self.metric_store);
|
||||
}
|
||||
}
|
||||
|
||||
// Also check for command output messages
|
||||
if let Ok(Some(cmd_output)) = self.zmq_consumer.receive_command_output().await {
|
||||
debug!(
|
||||
"Received command output from {}: {}",
|
||||
cmd_output.hostname,
|
||||
cmd_output.output_line
|
||||
);
|
||||
|
||||
// Command output (terminal popup removed - output not displayed)
|
||||
}
|
||||
|
||||
last_metrics_check = Instant::now();
|
||||
}
|
||||
|
||||
|
||||
// Check for host connectivity changes (heartbeat timeouts) periodically
|
||||
if last_heartbeat_check.elapsed() >= heartbeat_check_interval {
|
||||
let timeout = Duration::from_secs(self.config.zmq.heartbeat_timeout_seconds);
|
||||
|
||||
// Clean up metrics for offline hosts
|
||||
self.metric_store.cleanup_offline_hosts(timeout);
|
||||
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
let connected_hosts = self.metric_store.get_connected_hosts(timeout);
|
||||
tui_app.update_hosts(connected_hosts);
|
||||
}
|
||||
last_heartbeat_check = Instant::now();
|
||||
}
|
||||
|
||||
// Render TUI (only if not headless)
|
||||
if !self.headless {
|
||||
if let (Some(ref mut terminal), Some(ref mut tui_app)) = (&mut self.terminal, &mut self.tui_app) {
|
||||
if let Err(e) = terminal.draw(|frame| {
|
||||
tui_app.render(frame, &self.metric_store);
|
||||
}) {
|
||||
error!("Error rendering TUI: {}", e);
|
||||
break;
|
||||
if let Some(ref mut terminal) = self.terminal {
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
if let Err(e) = terminal.draw(|frame| {
|
||||
tui_app.render(frame, &self.metric_store);
|
||||
}) {
|
||||
error!("Error rendering TUI: {}", e);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Small sleep to prevent excessive CPU usage
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
}
|
||||
|
||||
|
||||
info!("Dashboard main loop ended");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
impl Drop for Dashboard {
|
||||
@@ -255,12 +278,9 @@ impl Drop for Dashboard {
|
||||
if !self.headless {
|
||||
let _ = disable_raw_mode();
|
||||
if let Some(ref mut terminal) = self.terminal {
|
||||
let _ = execute!(
|
||||
terminal.backend_mut(),
|
||||
LeaveAlternateScreen
|
||||
);
|
||||
let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen);
|
||||
let _ = terminal.show_cursor();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,22 +1,10 @@
|
||||
use anyhow::Result;
|
||||
use cm_dashboard_shared::{MetricMessage, MessageEnvelope, MessageType};
|
||||
use tracing::{info, error, debug, warn};
|
||||
use cm_dashboard_shared::{AgentData, CommandOutputMessage, MessageEnvelope, MessageType};
|
||||
use tracing::{debug, error, info, warn};
|
||||
use zmq::{Context, Socket, SocketType};
|
||||
|
||||
use crate::config::ZmqConfig;
|
||||
|
||||
/// Commands that can be sent to agents
|
||||
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
|
||||
pub enum AgentCommand {
|
||||
/// Request immediate metric collection
|
||||
CollectNow,
|
||||
/// Change collection interval
|
||||
SetInterval { seconds: u64 },
|
||||
/// Enable/disable a collector
|
||||
ToggleCollector { name: String, enabled: bool },
|
||||
/// Request status/health check
|
||||
Ping,
|
||||
}
|
||||
|
||||
/// ZMQ consumer for receiving metrics from agents
|
||||
pub struct ZmqConsumer {
|
||||
@@ -28,27 +16,27 @@ pub struct ZmqConsumer {
|
||||
impl ZmqConsumer {
|
||||
pub async fn new(config: &ZmqConfig) -> Result<Self> {
|
||||
let context = Context::new();
|
||||
|
||||
|
||||
// Create subscriber socket
|
||||
let subscriber = context.socket(SocketType::SUB)?;
|
||||
|
||||
|
||||
// Set socket options
|
||||
subscriber.set_rcvtimeo(1000)?; // 1 second timeout for non-blocking receives
|
||||
subscriber.set_subscribe(b"")?; // Subscribe to all messages
|
||||
|
||||
|
||||
info!("ZMQ consumer initialized");
|
||||
|
||||
|
||||
Ok(Self {
|
||||
subscriber,
|
||||
config: config.clone(),
|
||||
connected_hosts: std::collections::HashSet::new(),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Connect to a specific host's agent
|
||||
pub async fn connect_to_host(&mut self, hostname: &str, port: u16) -> Result<()> {
|
||||
let address = format!("tcp://{}:{}", hostname, port);
|
||||
|
||||
|
||||
match self.subscriber.connect(&address) {
|
||||
Ok(()) => {
|
||||
info!("Connected to agent at {}", address);
|
||||
@@ -61,52 +49,107 @@ impl ZmqConsumer {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Connect to predefined hosts
|
||||
pub async fn connect_to_predefined_hosts(&mut self, hosts: &[String]) -> Result<()> {
|
||||
|
||||
|
||||
/// Connect to predefined hosts using their configuration
|
||||
pub async fn connect_to_predefined_hosts(&mut self, hosts: &std::collections::HashMap<String, crate::config::HostDetails>) -> Result<()> {
|
||||
let default_port = self.config.subscriber_ports[0];
|
||||
|
||||
for hostname in hosts {
|
||||
// Try to connect, but don't fail if some hosts are unreachable
|
||||
if let Err(e) = self.connect_to_host(hostname, default_port).await {
|
||||
|
||||
for (hostname, host_details) in hosts {
|
||||
// Try to connect using configured IP, but don't fail if some hosts are unreachable
|
||||
if let Err(e) = self.connect_to_host_with_details(hostname, host_details, default_port).await {
|
||||
warn!("Could not connect to {}: {}", hostname, e);
|
||||
}
|
||||
}
|
||||
|
||||
info!("Connected to {} out of {} configured hosts",
|
||||
self.connected_hosts.len(), hosts.len());
|
||||
|
||||
|
||||
info!(
|
||||
"Connected to {} out of {} configured hosts",
|
||||
self.connected_hosts.len(),
|
||||
hosts.len()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Receive metrics from any connected agent (non-blocking)
|
||||
pub async fn receive_metrics(&mut self) -> Result<Option<MetricMessage>> {
|
||||
|
||||
/// Connect to a host using its configuration details
|
||||
pub async fn connect_to_host_with_details(&mut self, hostname: &str, host_details: &crate::config::HostDetails, port: u16) -> Result<()> {
|
||||
// Get primary connection IP only - no fallbacks
|
||||
let primary_ip = host_details.get_connection_ip(hostname);
|
||||
|
||||
// Connect directly without fallback attempts
|
||||
self.connect_to_host(&primary_ip, port).await
|
||||
}
|
||||
|
||||
/// Receive command output from any connected agent (non-blocking)
|
||||
pub async fn receive_command_output(&mut self) -> Result<Option<CommandOutputMessage>> {
|
||||
match self.subscriber.recv_bytes(zmq::DONTWAIT) {
|
||||
Ok(data) => {
|
||||
debug!("Received {} bytes from ZMQ", data.len());
|
||||
|
||||
// Deserialize envelope
|
||||
let envelope: MessageEnvelope = serde_json::from_slice(&data)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to deserialize envelope: {}", e))?;
|
||||
|
||||
|
||||
// Check message type
|
||||
match envelope.message_type {
|
||||
MessageType::Metrics => {
|
||||
let metrics = envelope.decode_metrics()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to decode metrics: {}", e))?;
|
||||
|
||||
debug!("Received {} metrics from {}",
|
||||
metrics.metrics.len(), metrics.hostname);
|
||||
|
||||
Ok(Some(metrics))
|
||||
MessageType::CommandOutput => {
|
||||
let cmd_output = envelope
|
||||
.decode_command_output()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to decode command output: {}", e))?;
|
||||
|
||||
debug!(
|
||||
"Received command output from {}: {}",
|
||||
cmd_output.hostname,
|
||||
cmd_output.output_line
|
||||
);
|
||||
|
||||
Ok(Some(cmd_output))
|
||||
}
|
||||
_ => Ok(None), // Not a command output message
|
||||
}
|
||||
}
|
||||
Err(zmq::Error::EAGAIN) => {
|
||||
// No message available (non-blocking mode)
|
||||
Ok(None)
|
||||
}
|
||||
Err(e) => {
|
||||
error!("ZMQ receive error: {}", e);
|
||||
Err(anyhow::anyhow!("ZMQ receive error: {}", e))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Receive agent data (non-blocking)
|
||||
pub async fn receive_agent_data(&mut self) -> Result<Option<AgentData>> {
|
||||
match self.subscriber.recv_bytes(zmq::DONTWAIT) {
|
||||
Ok(data) => {
|
||||
debug!("Received {} bytes from ZMQ", data.len());
|
||||
|
||||
// Deserialize envelope
|
||||
let envelope: MessageEnvelope = serde_json::from_slice(&data)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to deserialize envelope: {}", e))?;
|
||||
|
||||
// Check message type
|
||||
match envelope.message_type {
|
||||
MessageType::AgentData => {
|
||||
let agent_data = envelope
|
||||
.decode_agent_data()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to decode agent data: {}", e))?;
|
||||
|
||||
debug!(
|
||||
"Received agent data from host {}",
|
||||
agent_data.hostname
|
||||
);
|
||||
Ok(Some(agent_data))
|
||||
}
|
||||
MessageType::Heartbeat => {
|
||||
debug!("Received heartbeat");
|
||||
Ok(None) // Don't return heartbeats as metrics
|
||||
Ok(None) // Don't return heartbeats
|
||||
}
|
||||
MessageType::CommandOutput => {
|
||||
debug!("Received command output (will be handled by receive_command_output)");
|
||||
Ok(None) // Command output handled by separate method
|
||||
}
|
||||
_ => {
|
||||
debug!("Received non-metrics message: {:?}", envelope.message_type);
|
||||
debug!("Received unsupported message: {:?}", envelope.message_type);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
@@ -121,47 +164,6 @@ impl ZmqConsumer {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
/// ZMQ command sender for sending commands to agents
|
||||
pub struct ZmqCommandSender {
|
||||
context: Context,
|
||||
}
|
||||
|
||||
impl ZmqCommandSender {
|
||||
pub fn new(_config: &ZmqConfig) -> Result<Self> {
|
||||
let context = Context::new();
|
||||
|
||||
info!("ZMQ command sender initialized");
|
||||
|
||||
Ok(Self {
|
||||
context,
|
||||
})
|
||||
}
|
||||
|
||||
/// Send a command to a specific agent
|
||||
pub async fn send_command(&self, hostname: &str, command: AgentCommand) -> Result<()> {
|
||||
// Create a new PUSH socket for this command (ZMQ best practice)
|
||||
let socket = self.context.socket(SocketType::PUSH)?;
|
||||
|
||||
// Set socket options
|
||||
socket.set_linger(1000)?; // Wait up to 1 second on close
|
||||
socket.set_sndtimeo(5000)?; // 5 second send timeout
|
||||
|
||||
// Connect to agent's command port (6131)
|
||||
let address = format!("tcp://{}:6131", hostname);
|
||||
socket.connect(&address)?;
|
||||
|
||||
// Serialize command
|
||||
let serialized = serde_json::to_vec(&command)?;
|
||||
|
||||
// Send command
|
||||
socket.send(&serialized, 0)?;
|
||||
|
||||
info!("Sent command {:?} to agent at {}", command, hostname);
|
||||
|
||||
// Socket will be automatically closed when dropped
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -6,58 +6,64 @@ use std::path::Path;
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DashboardConfig {
|
||||
pub zmq: ZmqConfig,
|
||||
pub ui: UiConfig,
|
||||
pub hosts: HostsConfig,
|
||||
pub metrics: MetricsConfig,
|
||||
pub widgets: WidgetsConfig,
|
||||
pub hosts: std::collections::HashMap<String, HostDetails>,
|
||||
pub system: SystemConfig,
|
||||
pub ssh: SshConfig,
|
||||
pub service_logs: std::collections::HashMap<String, Vec<ServiceLogConfig>>,
|
||||
}
|
||||
|
||||
/// ZMQ consumer configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ZmqConfig {
|
||||
pub subscriber_ports: Vec<u16>,
|
||||
pub connection_timeout_ms: u64,
|
||||
pub reconnect_interval_ms: u64,
|
||||
/// Heartbeat timeout in seconds - hosts considered offline if no heartbeat received within this time
|
||||
#[serde(default = "default_heartbeat_timeout_seconds")]
|
||||
pub heartbeat_timeout_seconds: u64,
|
||||
}
|
||||
|
||||
/// UI configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct UiConfig {
|
||||
pub refresh_rate_ms: u64,
|
||||
pub theme: String,
|
||||
pub preserve_layout: bool,
|
||||
fn default_heartbeat_timeout_seconds() -> u64 {
|
||||
10 // Default to 10 seconds - allows for multiple missed heartbeats
|
||||
}
|
||||
|
||||
/// Hosts configuration
|
||||
/// Individual host configuration details
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct HostsConfig {
|
||||
pub auto_discovery: bool,
|
||||
pub predefined_hosts: Vec<String>,
|
||||
pub default_host: Option<String>,
|
||||
pub struct HostDetails {
|
||||
pub mac_address: Option<String>,
|
||||
/// Primary IP address (local network)
|
||||
pub ip: Option<String>,
|
||||
}
|
||||
|
||||
/// Metrics configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MetricsConfig {
|
||||
pub history_retention_hours: u64,
|
||||
pub max_metrics_per_host: usize,
|
||||
|
||||
impl HostDetails {
|
||||
/// Get the IP address for connection (uses ip field or hostname as fallback)
|
||||
pub fn get_connection_ip(&self, hostname: &str) -> String {
|
||||
self.ip.as_ref().unwrap_or(&hostname.to_string()).clone()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Widget configuration
|
||||
/// System configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WidgetsConfig {
|
||||
pub cpu: WidgetConfig,
|
||||
pub memory: WidgetConfig,
|
||||
pub storage: WidgetConfig,
|
||||
pub services: WidgetConfig,
|
||||
pub backup: WidgetConfig,
|
||||
pub struct SystemConfig {
|
||||
pub nixos_config_git_url: String,
|
||||
pub nixos_config_branch: String,
|
||||
pub nixos_config_working_dir: String,
|
||||
pub nixos_config_api_key_file: Option<String>,
|
||||
}
|
||||
|
||||
/// Individual widget configuration
|
||||
/// SSH configuration for rebuild and backup operations
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WidgetConfig {
|
||||
pub enabled: bool,
|
||||
pub metrics: Vec<String>,
|
||||
pub struct SshConfig {
|
||||
pub rebuild_user: String,
|
||||
pub rebuild_cmd: String,
|
||||
pub service_manage_cmd: String,
|
||||
}
|
||||
|
||||
/// Service log file configuration per host
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ServiceLogConfig {
|
||||
pub service_name: String,
|
||||
pub log_file_path: String,
|
||||
}
|
||||
|
||||
impl DashboardConfig {
|
||||
@@ -71,104 +77,13 @@ impl DashboardConfig {
|
||||
|
||||
impl Default for DashboardConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
zmq: ZmqConfig::default(),
|
||||
ui: UiConfig::default(),
|
||||
hosts: HostsConfig::default(),
|
||||
metrics: MetricsConfig::default(),
|
||||
widgets: WidgetsConfig::default(),
|
||||
}
|
||||
panic!("Dashboard configuration must be loaded from file - no hardcoded defaults allowed")
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ZmqConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
subscriber_ports: vec![6130],
|
||||
connection_timeout_ms: 15000,
|
||||
reconnect_interval_ms: 5000,
|
||||
}
|
||||
panic!("Dashboard configuration must be loaded from file - no hardcoded defaults allowed")
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for UiConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
refresh_rate_ms: 100,
|
||||
theme: "default".to_string(),
|
||||
preserve_layout: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for HostsConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
auto_discovery: true,
|
||||
predefined_hosts: vec![
|
||||
"cmbox".to_string(),
|
||||
"labbox".to_string(),
|
||||
"simonbox".to_string(),
|
||||
"steambox".to_string(),
|
||||
"srv01".to_string(),
|
||||
"srv02".to_string(),
|
||||
],
|
||||
default_host: Some("cmbox".to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for MetricsConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
history_retention_hours: 24,
|
||||
max_metrics_per_host: 10000,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for WidgetsConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
cpu: WidgetConfig {
|
||||
enabled: true,
|
||||
metrics: vec![
|
||||
"cpu_load_1min".to_string(),
|
||||
"cpu_load_5min".to_string(),
|
||||
"cpu_load_15min".to_string(),
|
||||
"cpu_temperature_celsius".to_string(),
|
||||
],
|
||||
},
|
||||
memory: WidgetConfig {
|
||||
enabled: true,
|
||||
metrics: vec![
|
||||
"memory_usage_percent".to_string(),
|
||||
"memory_total_gb".to_string(),
|
||||
"memory_available_gb".to_string(),
|
||||
],
|
||||
},
|
||||
storage: WidgetConfig {
|
||||
enabled: true,
|
||||
metrics: vec![
|
||||
"disk_nvme0_temperature_celsius".to_string(),
|
||||
"disk_nvme0_wear_percent".to_string(),
|
||||
"disk_nvme0_usage_percent".to_string(),
|
||||
],
|
||||
},
|
||||
services: WidgetConfig {
|
||||
enabled: true,
|
||||
metrics: vec![
|
||||
"service_ssh_status".to_string(),
|
||||
"service_ssh_memory_mb".to_string(),
|
||||
],
|
||||
},
|
||||
backup: WidgetConfig {
|
||||
enabled: true,
|
||||
metrics: vec![
|
||||
"backup_status".to_string(),
|
||||
"backup_last_run_timestamp".to_string(),
|
||||
],
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
// TODO: Implement hosts module
|
||||
@@ -1,18 +1,40 @@
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use tracing::{info, error};
|
||||
use std::process;
|
||||
use tracing::{error, info};
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
mod app;
|
||||
mod config;
|
||||
mod communication;
|
||||
mod config;
|
||||
mod metrics;
|
||||
mod ui;
|
||||
mod hosts;
|
||||
mod utils;
|
||||
|
||||
use app::Dashboard;
|
||||
|
||||
|
||||
/// Check if running inside tmux session
|
||||
fn check_tmux_session() {
|
||||
// Check for TMUX environment variable which is set when inside a tmux session
|
||||
if std::env::var("TMUX").is_err() {
|
||||
eprintln!("╭─────────────────────────────────────────────────────────────╮");
|
||||
eprintln!("│ ⚠️ TMUX REQUIRED │");
|
||||
eprintln!("├─────────────────────────────────────────────────────────────┤");
|
||||
eprintln!("│ CM Dashboard must be run inside a tmux session for proper │");
|
||||
eprintln!("│ terminal handling and remote operation functionality. │");
|
||||
eprintln!("│ │");
|
||||
eprintln!("│ Please start a tmux session first: │");
|
||||
eprintln!("│ tmux new-session -d -s dashboard cm-dashboard │");
|
||||
eprintln!("│ tmux attach-session -t dashboard │");
|
||||
eprintln!("│ │");
|
||||
eprintln!("│ Or simply: │");
|
||||
eprintln!("│ tmux │");
|
||||
eprintln!("│ cm-dashboard │");
|
||||
eprintln!("╰─────────────────────────────────────────────────────────────╯");
|
||||
process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "cm-dashboard")]
|
||||
#[command(about = "CM Dashboard TUI with individual metric consumption")]
|
||||
@@ -21,11 +43,11 @@ struct Cli {
|
||||
/// Increase logging verbosity (-v, -vv)
|
||||
#[arg(short, long, action = clap::ArgAction::Count)]
|
||||
verbose: u8,
|
||||
|
||||
/// Configuration file path
|
||||
|
||||
/// Configuration file path (defaults to /etc/cm-dashboard/dashboard.toml)
|
||||
#[arg(short, long)]
|
||||
config: Option<String>,
|
||||
|
||||
|
||||
/// Run in headless mode (no TUI, just logging)
|
||||
#[arg(long)]
|
||||
headless: bool,
|
||||
@@ -34,16 +56,16 @@ struct Cli {
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
|
||||
// Setup logging - only if headless or verbose
|
||||
if cli.headless || cli.verbose > 0 {
|
||||
let log_level = match cli.verbose {
|
||||
0 => "warn", // Only warnings and errors when not verbose
|
||||
0 => "warn", // Only warnings and errors when not verbose
|
||||
1 => "info",
|
||||
2 => "debug",
|
||||
2 => "debug",
|
||||
_ => "trace",
|
||||
};
|
||||
|
||||
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(EnvFilter::from_default_env().add_directive(log_level.parse()?))
|
||||
.init();
|
||||
@@ -53,21 +75,26 @@ async fn main() -> Result<()> {
|
||||
.with_env_filter(EnvFilter::from_default_env().add_directive("off".parse()?))
|
||||
.init();
|
||||
}
|
||||
|
||||
|
||||
// Check for tmux session requirement (only for TUI mode)
|
||||
if !cli.headless {
|
||||
check_tmux_session();
|
||||
}
|
||||
|
||||
if cli.headless || cli.verbose > 0 {
|
||||
info!("CM Dashboard starting with individual metrics architecture...");
|
||||
}
|
||||
|
||||
|
||||
// Create and run dashboard
|
||||
let mut dashboard = Dashboard::new(cli.config, cli.headless).await?;
|
||||
|
||||
|
||||
// Setup graceful shutdown
|
||||
let ctrl_c = async {
|
||||
tokio::signal::ctrl_c()
|
||||
.await
|
||||
.expect("failed to install Ctrl+C handler");
|
||||
};
|
||||
|
||||
|
||||
// Run dashboard with graceful shutdown
|
||||
tokio::select! {
|
||||
result = dashboard.run() => {
|
||||
@@ -80,9 +107,9 @@ async fn main() -> Result<()> {
|
||||
info!("Shutdown signal received");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if cli.headless || cli.verbose > 0 {
|
||||
info!("Dashboard shutdown complete");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,11 +4,8 @@ pub mod store;
|
||||
|
||||
pub use store::MetricStore;
|
||||
|
||||
|
||||
/// Historical metric data point
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MetricDataPoint {
|
||||
pub received_at: Instant,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use cm_dashboard_shared::Metric;
|
||||
use cm_dashboard_shared::AgentData;
|
||||
use std::collections::HashMap;
|
||||
use std::time::{Duration, Instant};
|
||||
use tracing::{debug, info, warn};
|
||||
@@ -7,12 +7,12 @@ use super::MetricDataPoint;
|
||||
|
||||
/// Central metric storage for the dashboard
|
||||
pub struct MetricStore {
|
||||
/// Current metrics: hostname -> metric_name -> metric
|
||||
current_metrics: HashMap<String, HashMap<String, Metric>>,
|
||||
/// Current structured data: hostname -> AgentData
|
||||
current_agent_data: HashMap<String, AgentData>,
|
||||
/// Historical metrics for trending
|
||||
historical_metrics: HashMap<String, Vec<MetricDataPoint>>,
|
||||
/// Last update timestamp per host
|
||||
last_update: HashMap<String, Instant>,
|
||||
/// Last heartbeat timestamp per host
|
||||
last_heartbeat: HashMap<String, Instant>,
|
||||
/// Configuration
|
||||
max_metrics_per_host: usize,
|
||||
history_retention: Duration,
|
||||
@@ -21,115 +21,153 @@ pub struct MetricStore {
|
||||
impl MetricStore {
|
||||
pub fn new(max_metrics_per_host: usize, history_retention_hours: u64) -> Self {
|
||||
Self {
|
||||
current_metrics: HashMap::new(),
|
||||
current_agent_data: HashMap::new(),
|
||||
historical_metrics: HashMap::new(),
|
||||
last_update: HashMap::new(),
|
||||
last_heartbeat: HashMap::new(),
|
||||
max_metrics_per_host,
|
||||
history_retention: Duration::from_secs(history_retention_hours * 3600),
|
||||
}
|
||||
}
|
||||
|
||||
/// Update metrics for a specific host
|
||||
pub fn update_metrics(&mut self, hostname: &str, metrics: Vec<Metric>) {
|
||||
|
||||
|
||||
/// Store structured agent data directly
|
||||
pub fn store_agent_data(&mut self, agent_data: AgentData) {
|
||||
let now = Instant::now();
|
||||
|
||||
debug!("Updating {} metrics for host {}", metrics.len(), hostname);
|
||||
|
||||
// Get or create host entry
|
||||
let host_metrics = self.current_metrics
|
||||
.entry(hostname.to_string())
|
||||
.or_insert_with(HashMap::new);
|
||||
|
||||
// Get or create historical entry
|
||||
let host_history = self.historical_metrics
|
||||
.entry(hostname.to_string())
|
||||
let hostname = agent_data.hostname.clone();
|
||||
|
||||
debug!("Storing structured data for host {}", hostname);
|
||||
|
||||
// Store the structured data directly
|
||||
self.current_agent_data.insert(hostname.clone(), agent_data);
|
||||
|
||||
// Update heartbeat timestamp
|
||||
self.last_heartbeat.insert(hostname.clone(), now);
|
||||
debug!("Updated heartbeat for host {}", hostname);
|
||||
|
||||
// Add to history
|
||||
let host_history = self
|
||||
.historical_metrics
|
||||
.entry(hostname.clone())
|
||||
.or_insert_with(Vec::new);
|
||||
|
||||
// Update current metrics and add to history
|
||||
for metric in metrics {
|
||||
let metric_name = metric.name.clone();
|
||||
|
||||
// Store current metric
|
||||
host_metrics.insert(metric_name.clone(), metric.clone());
|
||||
|
||||
// Add to history
|
||||
host_history.push(MetricDataPoint {
|
||||
received_at: now,
|
||||
});
|
||||
}
|
||||
|
||||
// Update last update timestamp
|
||||
self.last_update.insert(hostname.to_string(), now);
|
||||
|
||||
// Get metrics count before cleanup
|
||||
let metrics_count = host_metrics.len();
|
||||
|
||||
// Cleanup old history and enforce limits
|
||||
self.cleanup_host_data(hostname);
|
||||
|
||||
info!("Updated metrics for {}: {} current metrics",
|
||||
hostname, metrics_count);
|
||||
host_history.push(MetricDataPoint { received_at: now });
|
||||
|
||||
// Cleanup old data
|
||||
self.cleanup_host_data(&hostname);
|
||||
|
||||
info!("Stored structured data for {}", hostname);
|
||||
}
|
||||
|
||||
/// Get current metric for a specific host
|
||||
pub fn get_metric(&self, hostname: &str, metric_name: &str) -> Option<&Metric> {
|
||||
self.current_metrics
|
||||
.get(hostname)?
|
||||
.get(metric_name)
|
||||
|
||||
|
||||
|
||||
|
||||
/// Get current structured data for a host
|
||||
pub fn get_agent_data(&self, hostname: &str) -> Option<&AgentData> {
|
||||
self.current_agent_data.get(hostname)
|
||||
}
|
||||
|
||||
/// Get all current metrics for a host
|
||||
#[allow(dead_code)]
|
||||
pub fn get_host_metrics(&self, hostname: &str) -> Option<&HashMap<String, Metric>> {
|
||||
self.current_metrics.get(hostname)
|
||||
}
|
||||
|
||||
/// Get all current metrics for a host as a vector
|
||||
pub fn get_metrics_for_host(&self, hostname: &str) -> Vec<&Metric> {
|
||||
if let Some(metrics_map) = self.current_metrics.get(hostname) {
|
||||
metrics_map.values().collect()
|
||||
} else {
|
||||
Vec::new()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Get connected hosts (hosts with recent updates)
|
||||
|
||||
|
||||
/// Get connected hosts (hosts with recent heartbeats)
|
||||
pub fn get_connected_hosts(&self, timeout: Duration) -> Vec<String> {
|
||||
let now = Instant::now();
|
||||
|
||||
self.last_update
|
||||
|
||||
self.last_heartbeat
|
||||
.iter()
|
||||
.filter_map(|(hostname, &last_update)| {
|
||||
if now.duration_since(last_update) <= timeout {
|
||||
.filter_map(|(hostname, &last_heartbeat)| {
|
||||
if now.duration_since(last_heartbeat) <= timeout {
|
||||
Some(hostname.clone())
|
||||
} else {
|
||||
debug!("Host {} considered offline - last heartbeat was {:?} ago",
|
||||
hostname, now.duration_since(last_heartbeat));
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/// Clean up data for offline hosts
|
||||
pub fn cleanup_offline_hosts(&mut self, timeout: Duration) {
|
||||
let now = Instant::now();
|
||||
let mut hosts_to_cleanup = Vec::new();
|
||||
|
||||
// Find hosts that are offline (no recent heartbeat)
|
||||
for (hostname, &last_heartbeat) in &self.last_heartbeat {
|
||||
if now.duration_since(last_heartbeat) > timeout {
|
||||
hosts_to_cleanup.push(hostname.clone());
|
||||
}
|
||||
}
|
||||
|
||||
// Clear data for offline hosts
|
||||
for hostname in hosts_to_cleanup {
|
||||
if let Some(_agent_data) = self.current_agent_data.remove(&hostname) {
|
||||
info!("Cleared structured data for offline host: {}", hostname);
|
||||
}
|
||||
// Keep heartbeat timestamp for reconnection detection
|
||||
// Don't remove from last_heartbeat to track when host was last seen
|
||||
}
|
||||
}
|
||||
|
||||
/// Cleanup old data and enforce limits
|
||||
fn cleanup_host_data(&mut self, hostname: &str) {
|
||||
let now = Instant::now();
|
||||
|
||||
|
||||
// Cleanup historical data
|
||||
if let Some(history) = self.historical_metrics.get_mut(hostname) {
|
||||
// Remove old entries
|
||||
history.retain(|dp| now.duration_since(dp.received_at) <= self.history_retention);
|
||||
|
||||
|
||||
// Enforce size limit
|
||||
if history.len() > self.max_metrics_per_host {
|
||||
let excess = history.len() - self.max_metrics_per_host;
|
||||
history.drain(0..excess);
|
||||
warn!("Trimmed {} old metrics for host {} (size limit: {})",
|
||||
excess, hostname, self.max_metrics_per_host);
|
||||
warn!(
|
||||
"Trimmed {} old metrics for host {} (size limit: {})",
|
||||
excess, hostname, self.max_metrics_per_host
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Get agent versions from all hosts for cross-host comparison
|
||||
pub fn get_agent_versions(&self) -> HashMap<String, String> {
|
||||
let mut versions = HashMap::new();
|
||||
|
||||
for (hostname, agent_data) in &self.current_agent_data {
|
||||
versions.insert(hostname.clone(), agent_data.agent_version.clone());
|
||||
}
|
||||
|
||||
versions
|
||||
}
|
||||
|
||||
/// Check for agent version mismatches across hosts
|
||||
pub fn get_version_mismatches(&self) -> Option<(String, Vec<String>)> {
|
||||
let versions = self.get_agent_versions();
|
||||
|
||||
if versions.len() < 2 {
|
||||
return None; // Need at least 2 hosts to compare
|
||||
}
|
||||
|
||||
// Find the most common version (assume it's the "current" version)
|
||||
let mut version_counts = HashMap::new();
|
||||
for version in versions.values() {
|
||||
*version_counts.entry(version.clone()).or_insert(0) += 1;
|
||||
}
|
||||
|
||||
let most_common_version = version_counts
|
||||
.iter()
|
||||
.max_by_key(|(_, count)| *count)
|
||||
.map(|(version, _)| version.clone())?;
|
||||
|
||||
// Find hosts with different versions
|
||||
let outdated_hosts: Vec<String> = versions
|
||||
.iter()
|
||||
.filter(|(_, version)| *version != &most_common_version)
|
||||
.map(|(hostname, _)| hostname.clone())
|
||||
.collect();
|
||||
|
||||
if outdated_hosts.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some((most_common_version, outdated_hosts))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,26 +9,29 @@ use ratatui::{
|
||||
use std::collections::HashMap;
|
||||
use std::time::Instant;
|
||||
use tracing::info;
|
||||
use wake_on_lan::MagicPacket;
|
||||
|
||||
pub mod theme;
|
||||
pub mod widgets;
|
||||
|
||||
use crate::config::DashboardConfig;
|
||||
use crate::metrics::MetricStore;
|
||||
use cm_dashboard_shared::{Metric, Status};
|
||||
use theme::{Components, Layout as ThemeLayout, StatusIcons, Theme, Typography};
|
||||
use widgets::{BackupWidget, CpuWidget, MemoryWidget, ServicesWidget, Widget};
|
||||
use cm_dashboard_shared::Status;
|
||||
use theme::{Components, Layout as ThemeLayout, Theme, Typography};
|
||||
use widgets::{ServicesWidget, SystemWidget, Widget};
|
||||
|
||||
|
||||
|
||||
|
||||
/// Panel types for focus management
|
||||
|
||||
/// Widget states for a specific host
|
||||
#[derive(Clone)]
|
||||
pub struct HostWidgets {
|
||||
/// CPU widget state
|
||||
pub cpu_widget: CpuWidget,
|
||||
/// Memory widget state
|
||||
pub memory_widget: MemoryWidget,
|
||||
/// System widget state (includes CPU, Memory, NixOS info, Storage)
|
||||
pub system_widget: SystemWidget,
|
||||
/// Services widget state
|
||||
pub services_widget: ServicesWidget,
|
||||
/// Backup widget state
|
||||
pub backup_widget: BackupWidget,
|
||||
/// Last update time for this host
|
||||
pub last_update: Option<Instant>,
|
||||
}
|
||||
@@ -36,15 +39,14 @@ pub struct HostWidgets {
|
||||
impl HostWidgets {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
cpu_widget: CpuWidget::new(),
|
||||
memory_widget: MemoryWidget::new(),
|
||||
system_widget: SystemWidget::new(),
|
||||
services_widget: ServicesWidget::new(),
|
||||
backup_widget: BackupWidget::new(),
|
||||
last_update: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Main TUI application
|
||||
pub struct TuiApp {
|
||||
/// Widget states per host (hostname -> HostWidgets)
|
||||
@@ -59,18 +61,35 @@ pub struct TuiApp {
|
||||
should_quit: bool,
|
||||
/// Track if user manually navigated away from localhost
|
||||
user_navigated_away: bool,
|
||||
/// Dashboard configuration
|
||||
config: DashboardConfig,
|
||||
/// Cached localhost hostname to avoid repeated system calls
|
||||
localhost: String,
|
||||
}
|
||||
|
||||
impl TuiApp {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
pub fn new(config: DashboardConfig) -> Self {
|
||||
let localhost = gethostname::gethostname().to_string_lossy().to_string();
|
||||
let mut app = Self {
|
||||
host_widgets: HashMap::new(),
|
||||
current_host: None,
|
||||
available_hosts: Vec::new(),
|
||||
available_hosts: config.hosts.keys().cloned().collect(),
|
||||
host_index: 0,
|
||||
should_quit: false,
|
||||
user_navigated_away: false,
|
||||
config,
|
||||
localhost,
|
||||
};
|
||||
|
||||
// Sort predefined hosts
|
||||
app.available_hosts.sort();
|
||||
|
||||
// Initialize with first host if available
|
||||
if !app.available_hosts.is_empty() {
|
||||
app.current_host = Some(app.available_hosts[0].clone());
|
||||
}
|
||||
|
||||
app
|
||||
}
|
||||
|
||||
/// Get or create host widgets for the given hostname
|
||||
@@ -80,51 +99,16 @@ impl TuiApp {
|
||||
.or_insert_with(HostWidgets::new)
|
||||
}
|
||||
|
||||
/// Update widgets with metrics from store (only for current host)
|
||||
/// Update widgets with structured data from store (only for current host)
|
||||
pub fn update_metrics(&mut self, metric_store: &MetricStore) {
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
// Only update widgets if we have metrics for this host
|
||||
let all_metrics = metric_store.get_metrics_for_host(&hostname);
|
||||
if !all_metrics.is_empty() {
|
||||
// Get metrics first while hostname is borrowed
|
||||
let cpu_metrics: Vec<&Metric> = all_metrics
|
||||
.iter()
|
||||
.filter(|m| {
|
||||
m.name.starts_with("cpu_")
|
||||
|| m.name.contains("c_state_")
|
||||
|| m.name.starts_with("process_top_")
|
||||
})
|
||||
.copied()
|
||||
.collect();
|
||||
let memory_metrics: Vec<&Metric> = all_metrics
|
||||
.iter()
|
||||
.filter(|m| m.name.starts_with("memory_") || m.name.starts_with("disk_tmp_"))
|
||||
.copied()
|
||||
.collect();
|
||||
let service_metrics: Vec<&Metric> = all_metrics
|
||||
.iter()
|
||||
.filter(|m| m.name.starts_with("service_"))
|
||||
.copied()
|
||||
.collect();
|
||||
let all_backup_metrics: Vec<&Metric> = all_metrics
|
||||
.iter()
|
||||
.filter(|m| m.name.starts_with("backup_"))
|
||||
.copied()
|
||||
.collect();
|
||||
|
||||
// Now get host widgets and update them
|
||||
// Get structured data for this host
|
||||
if let Some(agent_data) = metric_store.get_agent_data(&hostname) {
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
|
||||
host_widgets.cpu_widget.update_from_metrics(&cpu_metrics);
|
||||
host_widgets
|
||||
.memory_widget
|
||||
.update_from_metrics(&memory_metrics);
|
||||
host_widgets
|
||||
.services_widget
|
||||
.update_from_metrics(&service_metrics);
|
||||
host_widgets
|
||||
.backup_widget
|
||||
.update_from_metrics(&all_backup_metrics);
|
||||
// Update all widgets with structured data directly
|
||||
host_widgets.system_widget.update_from_agent_data(agent_data);
|
||||
host_widgets.services_widget.update_from_agent_data(agent_data);
|
||||
|
||||
host_widgets.last_update = Some(Instant::now());
|
||||
}
|
||||
@@ -132,22 +116,28 @@ impl TuiApp {
|
||||
}
|
||||
|
||||
/// Update available hosts with localhost prioritization
|
||||
pub fn update_hosts(&mut self, hosts: Vec<String>) {
|
||||
// Sort hosts alphabetically
|
||||
let mut sorted_hosts = hosts.clone();
|
||||
sorted_hosts.sort();
|
||||
self.available_hosts = sorted_hosts;
|
||||
pub fn update_hosts(&mut self, discovered_hosts: Vec<String>) {
|
||||
// Start with configured hosts (always visible)
|
||||
let mut all_hosts: Vec<String> = self.config.hosts.keys().cloned().collect();
|
||||
|
||||
// Add any discovered hosts that aren't already configured
|
||||
for host in discovered_hosts {
|
||||
if !all_hosts.contains(&host) {
|
||||
all_hosts.push(host);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
all_hosts.sort();
|
||||
self.available_hosts = all_hosts;
|
||||
|
||||
// Get the current hostname (localhost) for auto-selection
|
||||
let localhost = gethostname::gethostname().to_string_lossy().to_string();
|
||||
|
||||
// Prioritize localhost if it becomes available, but respect user navigation
|
||||
let localhost = gethostname::gethostname().to_string_lossy().to_string();
|
||||
if !self.available_hosts.is_empty() {
|
||||
if self.available_hosts.contains(&localhost) && !self.user_navigated_away {
|
||||
if self.available_hosts.contains(&self.localhost) && !self.user_navigated_away {
|
||||
// Localhost is available and user hasn't navigated away - switch to it
|
||||
self.current_host = Some(localhost);
|
||||
self.host_index = 0; // localhost is always first in sorted_hosts
|
||||
self.current_host = Some(self.localhost.clone());
|
||||
// Find the actual index of localhost in the sorted list
|
||||
self.host_index = self.available_hosts.iter().position(|h| h == &self.localhost).unwrap_or(0);
|
||||
} else if self.current_host.is_none() {
|
||||
// No current host - select first available (which is localhost if available)
|
||||
self.current_host = Some(self.available_hosts[0].clone());
|
||||
@@ -180,11 +170,192 @@ impl TuiApp {
|
||||
self.navigate_host(1);
|
||||
}
|
||||
KeyCode::Char('r') => {
|
||||
info!("Manual refresh requested");
|
||||
// Refresh will be handled by main loop
|
||||
// System rebuild command - works on any panel for current host
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let connection_ip = self.get_connection_ip(&hostname);
|
||||
// Create command that shows logo, rebuilds, and waits for user input
|
||||
let logo_and_rebuild = format!(
|
||||
"echo 'Rebuilding system: {} ({})' && ssh -tt {}@{} \"bash -ic '{}'\"",
|
||||
hostname,
|
||||
connection_ip,
|
||||
self.config.ssh.rebuild_user,
|
||||
connection_ip,
|
||||
self.config.ssh.rebuild_cmd
|
||||
);
|
||||
|
||||
std::process::Command::new("tmux")
|
||||
.arg("split-window")
|
||||
.arg("-v")
|
||||
.arg("-p")
|
||||
.arg("30")
|
||||
.arg(&logo_and_rebuild)
|
||||
.spawn()
|
||||
.ok(); // Ignore errors, tmux will handle them
|
||||
}
|
||||
}
|
||||
KeyCode::Char('B') => {
|
||||
// Backup command - works on any panel for current host
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let connection_ip = self.get_connection_ip(&hostname);
|
||||
// Create command that shows logo, runs backup, and waits for user input
|
||||
let logo_and_backup = format!(
|
||||
"echo 'Running backup: {} ({})' && ssh -tt {}@{} \"bash -ic '{}'\"",
|
||||
hostname,
|
||||
connection_ip,
|
||||
self.config.ssh.rebuild_user,
|
||||
connection_ip,
|
||||
format!("{} start borgbackup", self.config.ssh.service_manage_cmd)
|
||||
);
|
||||
|
||||
std::process::Command::new("tmux")
|
||||
.arg("split-window")
|
||||
.arg("-v")
|
||||
.arg("-p")
|
||||
.arg("30")
|
||||
.arg(&logo_and_backup)
|
||||
.spawn()
|
||||
.ok(); // Ignore errors, tmux will handle them
|
||||
}
|
||||
}
|
||||
KeyCode::Char('s') => {
|
||||
// Service start command via SSH with progress display
|
||||
if let (Some(service_name), Some(hostname)) = (self.get_selected_service(), self.current_host.clone()) {
|
||||
let connection_ip = self.get_connection_ip(&hostname);
|
||||
let service_start_command = format!(
|
||||
"echo 'Starting service: {} on {}' && ssh -tt {}@{} \"bash -ic '{} start {}'\"",
|
||||
service_name,
|
||||
hostname,
|
||||
self.config.ssh.rebuild_user,
|
||||
connection_ip,
|
||||
self.config.ssh.service_manage_cmd,
|
||||
service_name
|
||||
);
|
||||
|
||||
std::process::Command::new("tmux")
|
||||
.arg("split-window")
|
||||
.arg("-v")
|
||||
.arg("-p")
|
||||
.arg("30")
|
||||
.arg(&service_start_command)
|
||||
.spawn()
|
||||
.ok(); // Ignore errors, tmux will handle them
|
||||
}
|
||||
}
|
||||
KeyCode::Char('S') => {
|
||||
// Service stop command via SSH with progress display
|
||||
if let (Some(service_name), Some(hostname)) = (self.get_selected_service(), self.current_host.clone()) {
|
||||
let connection_ip = self.get_connection_ip(&hostname);
|
||||
let service_stop_command = format!(
|
||||
"echo 'Stopping service: {} on {}' && ssh -tt {}@{} \"bash -ic '{} stop {}'\"",
|
||||
service_name,
|
||||
hostname,
|
||||
self.config.ssh.rebuild_user,
|
||||
connection_ip,
|
||||
self.config.ssh.service_manage_cmd,
|
||||
service_name
|
||||
);
|
||||
|
||||
std::process::Command::new("tmux")
|
||||
.arg("split-window")
|
||||
.arg("-v")
|
||||
.arg("-p")
|
||||
.arg("30")
|
||||
.arg(&service_stop_command)
|
||||
.spawn()
|
||||
.ok(); // Ignore errors, tmux will handle them
|
||||
}
|
||||
}
|
||||
KeyCode::Char('L') => {
|
||||
// Show service logs via service-manage script in tmux split window
|
||||
if let (Some(service_name), Some(hostname)) = (self.get_selected_service(), self.current_host.clone()) {
|
||||
let connection_ip = self.get_connection_ip(&hostname);
|
||||
let logs_command = format!(
|
||||
"ssh -tt {}@{} '{} logs {}'",
|
||||
self.config.ssh.rebuild_user,
|
||||
connection_ip,
|
||||
self.config.ssh.service_manage_cmd,
|
||||
service_name
|
||||
);
|
||||
|
||||
std::process::Command::new("tmux")
|
||||
.arg("split-window")
|
||||
.arg("-v")
|
||||
.arg("-p")
|
||||
.arg("30")
|
||||
.arg(&logs_command)
|
||||
.spawn()
|
||||
.ok(); // Ignore errors, tmux will handle them
|
||||
}
|
||||
}
|
||||
KeyCode::Char('w') => {
|
||||
// Wake on LAN for offline hosts
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
// Check if host has MAC address configured
|
||||
if let Some(host_details) = self.config.hosts.get(&hostname) {
|
||||
if let Some(mac_address) = &host_details.mac_address {
|
||||
// Parse MAC address and send WoL packet
|
||||
let mac_bytes = Self::parse_mac_address(mac_address);
|
||||
match mac_bytes {
|
||||
Ok(mac) => {
|
||||
match MagicPacket::new(&mac).send() {
|
||||
Ok(_) => {
|
||||
info!("WakeOnLAN packet sent successfully to {} ({})", hostname, mac_address);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Failed to send WakeOnLAN packet to {}: {}", hostname, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
tracing::error!("Invalid MAC address format for {}: {}", hostname, mac_address);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
KeyCode::Char('t') => {
|
||||
// Open SSH terminal session in tmux window
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let connection_ip = self.get_connection_ip(&hostname);
|
||||
let ssh_command = format!(
|
||||
"echo 'Opening SSH terminal to: {}' && ssh -tt {}@{}",
|
||||
hostname,
|
||||
self.config.ssh.rebuild_user,
|
||||
connection_ip
|
||||
);
|
||||
|
||||
std::process::Command::new("tmux")
|
||||
.arg("split-window")
|
||||
.arg("-v")
|
||||
.arg("-p")
|
||||
.arg("30") // Use 30% like other commands
|
||||
.arg(&ssh_command)
|
||||
.spawn()
|
||||
.ok(); // Ignore errors, tmux will handle them
|
||||
}
|
||||
}
|
||||
KeyCode::Tab => {
|
||||
self.navigate_host(1); // Tab cycles to next host
|
||||
// Tab cycles to next host
|
||||
self.navigate_host(1);
|
||||
}
|
||||
KeyCode::Up | KeyCode::Char('k') => {
|
||||
// Move service selection up
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets.services_widget.select_previous();
|
||||
}
|
||||
}
|
||||
KeyCode::Down | KeyCode::Char('j') => {
|
||||
// Move service selection down
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let total_services = {
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets.services_widget.get_total_services_count()
|
||||
};
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets.services_widget.select_next(total_services);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
@@ -212,9 +383,8 @@ impl TuiApp {
|
||||
self.current_host = Some(self.available_hosts[self.host_index].clone());
|
||||
|
||||
// Check if user navigated away from localhost
|
||||
let localhost = gethostname::gethostname().to_string_lossy().to_string();
|
||||
if let Some(ref current) = self.current_host {
|
||||
if current != &localhost {
|
||||
if current != &self.localhost {
|
||||
self.user_navigated_away = true;
|
||||
} else {
|
||||
self.user_navigated_away = false; // User navigated back to localhost
|
||||
@@ -224,6 +394,32 @@ impl TuiApp {
|
||||
info!("Switched to host: {}", self.current_host.as_ref().unwrap());
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/// Get the currently selected service name from the services widget
|
||||
fn get_selected_service(&self) -> Option<String> {
|
||||
if let Some(hostname) = &self.current_host {
|
||||
if let Some(host_widgets) = self.host_widgets.get(hostname) {
|
||||
return host_widgets.services_widget.get_selected_service();
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
|
||||
/// Should quit application
|
||||
pub fn should_quit(&self) -> bool {
|
||||
self.should_quit
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/// Render the dashboard (real btop-style multi-panel layout)
|
||||
pub fn render(&mut self, frame: &mut Frame, metric_store: &MetricStore) {
|
||||
let size = frame.size();
|
||||
@@ -235,13 +431,13 @@ impl TuiApp {
|
||||
);
|
||||
|
||||
// Create real btop-style layout: multi-panel with borders
|
||||
// Top section: title bar
|
||||
// Bottom section: split into left (mem + disks) and right (CPU + processes)
|
||||
// Three-section layout: title bar, main content, statusbar
|
||||
let main_chunks = Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([
|
||||
Constraint::Length(1), // Title bar
|
||||
Constraint::Min(0), // Main content area
|
||||
Constraint::Length(1), // Statusbar
|
||||
])
|
||||
.split(size);
|
||||
|
||||
@@ -252,50 +448,47 @@ impl TuiApp {
|
||||
Constraint::Percentage(ThemeLayout::LEFT_PANEL_WIDTH), // Left side: system, backup
|
||||
Constraint::Percentage(ThemeLayout::RIGHT_PANEL_WIDTH), // Right side: services (100% height)
|
||||
])
|
||||
.split(main_chunks[1]);
|
||||
.split(main_chunks[1]); // main_chunks[1] is now the content area (between title and statusbar)
|
||||
|
||||
// Check if backup panel should be shown
|
||||
let show_backup = if let Some(hostname) = self.current_host.clone() {
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets.backup_widget.has_data()
|
||||
// Check if current host is offline
|
||||
let current_host_offline = if let Some(hostname) = self.current_host.clone() {
|
||||
self.calculate_host_status(&hostname, metric_store) == Status::Offline
|
||||
} else {
|
||||
false
|
||||
true // No host selected is considered offline
|
||||
};
|
||||
|
||||
// Left side: dynamic layout based on backup data availability
|
||||
let left_chunks = if show_backup {
|
||||
// Show both system and backup panels
|
||||
ratatui::layout::Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([
|
||||
Constraint::Percentage(ThemeLayout::SYSTEM_PANEL_HEIGHT), // System section
|
||||
Constraint::Percentage(ThemeLayout::BACKUP_PANEL_HEIGHT), // Backup section
|
||||
])
|
||||
.split(content_chunks[0])
|
||||
} else {
|
||||
// Show only system panel (full height)
|
||||
ratatui::layout::Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([Constraint::Percentage(100)]) // System section takes full height
|
||||
.split(content_chunks[0])
|
||||
};
|
||||
// If host is offline, render wake-up message instead of panels
|
||||
if current_host_offline {
|
||||
self.render_offline_host_message(frame, main_chunks[1]);
|
||||
self.render_btop_title(frame, main_chunks[0], metric_store);
|
||||
self.render_statusbar(frame, main_chunks[2]);
|
||||
return;
|
||||
}
|
||||
|
||||
// Left side: system panel only (full height)
|
||||
let left_chunks = ratatui::layout::Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([Constraint::Percentage(100)]) // System section takes full height
|
||||
.split(content_chunks[0]);
|
||||
|
||||
// Render title bar
|
||||
self.render_btop_title(frame, main_chunks[0], metric_store);
|
||||
|
||||
// Render new panel layout
|
||||
// Render system panel
|
||||
self.render_system_panel(frame, left_chunks[0], metric_store);
|
||||
if show_backup && left_chunks.len() > 1 {
|
||||
self.render_backup_panel(frame, left_chunks[1]);
|
||||
}
|
||||
|
||||
// Render services widget for current host
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let is_focused = true; // Always show service selection
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets
|
||||
.services_widget
|
||||
.render(frame, content_chunks[1]); // Services takes full right side
|
||||
.render(frame, content_chunks[1], is_focused); // Services takes full right side
|
||||
}
|
||||
|
||||
// Render statusbar at the bottom
|
||||
self.render_statusbar(frame, main_chunks[2]); // main_chunks[2] is the statusbar area
|
||||
|
||||
}
|
||||
|
||||
/// Render btop-style minimal title with host status colors
|
||||
@@ -306,323 +499,244 @@ impl TuiApp {
|
||||
|
||||
if self.available_hosts.is_empty() {
|
||||
let title_text = "cm-dashboard • no hosts discovered";
|
||||
let title = Paragraph::new(title_text).style(Typography::title());
|
||||
let title = Paragraph::new(title_text)
|
||||
.style(Style::default().fg(Theme::background()).bg(Theme::status_color(Status::Unknown)));
|
||||
frame.render_widget(title, area);
|
||||
return;
|
||||
}
|
||||
|
||||
// Create spans for each host with status indicators
|
||||
let mut spans = vec![Span::styled("cm-dashboard • ", Typography::title())];
|
||||
// Calculate worst-case status across all hosts (excluding offline)
|
||||
let mut worst_status = Status::Ok;
|
||||
for host in &self.available_hosts {
|
||||
let host_status = self.calculate_host_status(host, metric_store);
|
||||
// Don't include offline hosts in status aggregation
|
||||
if host_status != Status::Offline {
|
||||
worst_status = Status::aggregate(&[worst_status, host_status]);
|
||||
}
|
||||
}
|
||||
|
||||
// Use the worst status color as background
|
||||
let background_color = Theme::status_color(worst_status);
|
||||
|
||||
// Split the title bar into left and right sections
|
||||
let chunks = Layout::default()
|
||||
.direction(Direction::Horizontal)
|
||||
.constraints([Constraint::Length(22), Constraint::Min(0)])
|
||||
.split(area);
|
||||
|
||||
// Left side: "cm-dashboard" text with version
|
||||
let title_text = format!(" cm-dashboard v{}", env!("CARGO_PKG_VERSION"));
|
||||
let left_span = Span::styled(
|
||||
&title_text,
|
||||
Style::default().fg(Theme::background()).bg(background_color).add_modifier(Modifier::BOLD)
|
||||
);
|
||||
let left_title = Paragraph::new(Line::from(vec![left_span]))
|
||||
.style(Style::default().bg(background_color));
|
||||
frame.render_widget(left_title, chunks[0]);
|
||||
|
||||
// Right side: hosts with status indicators
|
||||
let mut host_spans = Vec::new();
|
||||
|
||||
for (i, host) in self.available_hosts.iter().enumerate() {
|
||||
if i > 0 {
|
||||
spans.push(Span::styled(" ", Typography::title()));
|
||||
host_spans.push(Span::styled(
|
||||
" ",
|
||||
Style::default().fg(Theme::background()).bg(background_color)
|
||||
));
|
||||
}
|
||||
|
||||
// Calculate overall host status from metrics
|
||||
// Always show normal status icon based on metrics (no command status at host level)
|
||||
let host_status = self.calculate_host_status(host, metric_store);
|
||||
let status_icon = StatusIcons::get_icon(host_status);
|
||||
let status_color = Theme::status_color(host_status);
|
||||
|
||||
// Add status icon
|
||||
spans.push(Span::styled(
|
||||
// Add status icon with background color as foreground against status background
|
||||
host_spans.push(Span::styled(
|
||||
format!("{} ", status_icon),
|
||||
Style::default().fg(status_color),
|
||||
Style::default().fg(Theme::background()).bg(background_color),
|
||||
));
|
||||
|
||||
if Some(host) == self.current_host.as_ref() {
|
||||
// Selected host in bold bright white
|
||||
spans.push(Span::styled(
|
||||
// Selected host in bold background color against status background
|
||||
host_spans.push(Span::styled(
|
||||
host.clone(),
|
||||
Typography::title().add_modifier(Modifier::BOLD),
|
||||
Style::default()
|
||||
.fg(Theme::background())
|
||||
.bg(background_color)
|
||||
.add_modifier(Modifier::BOLD),
|
||||
));
|
||||
} else {
|
||||
// Other hosts in normal style with status color
|
||||
spans.push(Span::styled(
|
||||
// Other hosts in normal background color against status background
|
||||
host_spans.push(Span::styled(
|
||||
host.clone(),
|
||||
Style::default().fg(status_color),
|
||||
Style::default().fg(Theme::background()).bg(background_color),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let title_line = Line::from(spans);
|
||||
let title = Paragraph::new(vec![title_line]);
|
||||
// Add right padding
|
||||
host_spans.push(Span::styled(
|
||||
" ",
|
||||
Style::default().fg(Theme::background()).bg(background_color)
|
||||
));
|
||||
|
||||
frame.render_widget(title, area);
|
||||
let host_line = Line::from(host_spans);
|
||||
let host_title = Paragraph::new(vec![host_line])
|
||||
.style(Style::default().bg(background_color))
|
||||
.alignment(ratatui::layout::Alignment::Right);
|
||||
frame.render_widget(host_title, chunks[1]);
|
||||
}
|
||||
|
||||
/// Calculate overall status for a host based on its metrics
|
||||
/// Calculate overall status for a host based on its structured data
|
||||
fn calculate_host_status(&self, hostname: &str, metric_store: &MetricStore) -> Status {
|
||||
let metrics = metric_store.get_metrics_for_host(hostname);
|
||||
|
||||
if metrics.is_empty() {
|
||||
return Status::Unknown;
|
||||
// Check if we have structured data for this host
|
||||
if let Some(_agent_data) = metric_store.get_agent_data(hostname) {
|
||||
// Return OK since we have data
|
||||
Status::Ok
|
||||
} else {
|
||||
Status::Offline
|
||||
}
|
||||
|
||||
// Check if any metric is critical
|
||||
if metrics.iter().any(|m| m.status == Status::Critical) {
|
||||
return Status::Critical;
|
||||
}
|
||||
|
||||
// Check if any metric is warning
|
||||
if metrics.iter().any(|m| m.status == Status::Warning) {
|
||||
return Status::Warning;
|
||||
}
|
||||
|
||||
// Check if all metrics are ok
|
||||
if metrics.iter().all(|m| m.status == Status::Ok) {
|
||||
return Status::Ok;
|
||||
}
|
||||
|
||||
// Default to unknown if mixed statuses
|
||||
Status::Unknown
|
||||
}
|
||||
|
||||
fn render_system_panel(&mut self, frame: &mut Frame, area: Rect, metric_store: &MetricStore) {
|
||||
/// Render dynamic statusbar with context-aware shortcuts
|
||||
fn render_statusbar(&self, frame: &mut Frame, area: Rect) {
|
||||
let shortcuts = self.get_context_shortcuts();
|
||||
let statusbar_text = shortcuts.join(" • ");
|
||||
|
||||
let statusbar = Paragraph::new(statusbar_text)
|
||||
.style(Typography::secondary())
|
||||
.alignment(ratatui::layout::Alignment::Center);
|
||||
|
||||
frame.render_widget(statusbar, area);
|
||||
}
|
||||
|
||||
/// Get context-aware shortcuts based on focused panel
|
||||
fn get_context_shortcuts(&self) -> Vec<String> {
|
||||
let mut shortcuts = Vec::new();
|
||||
|
||||
// Global shortcuts
|
||||
shortcuts.push("Tab: Host".to_string());
|
||||
shortcuts.push("↑↓/jk: Select".to_string());
|
||||
shortcuts.push("r: Rebuild".to_string());
|
||||
shortcuts.push("B: Backup".to_string());
|
||||
shortcuts.push("s/S: Start/Stop".to_string());
|
||||
shortcuts.push("L: Logs".to_string());
|
||||
shortcuts.push("t: Terminal".to_string());
|
||||
shortcuts.push("w: Wake".to_string());
|
||||
|
||||
// Always show quit
|
||||
shortcuts.push("q: Quit".to_string());
|
||||
|
||||
shortcuts
|
||||
}
|
||||
|
||||
fn render_system_panel(&mut self, frame: &mut Frame, area: Rect, _metric_store: &MetricStore) {
|
||||
let system_block = Components::widget_block("system");
|
||||
let inner_area = system_block.inner(area);
|
||||
frame.render_widget(system_block, area);
|
||||
let content_chunks = ratatui::layout::Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([
|
||||
Constraint::Length(ThemeLayout::CPU_SECTION_HEIGHT), // CPU section (title, load)
|
||||
Constraint::Length(ThemeLayout::MEMORY_SECTION_HEIGHT), // Memory section (title, used, /tmp)
|
||||
Constraint::Min(0), // Storage section
|
||||
])
|
||||
.split(inner_area);
|
||||
|
||||
// Get current host widgets, create if none exist
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
// Clone the config to avoid borrowing issues
|
||||
let config = self.config.clone();
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets.cpu_widget.render(frame, content_chunks[0]);
|
||||
host_widgets.memory_widget.render(frame, content_chunks[1]);
|
||||
}
|
||||
self.render_storage_section(frame, content_chunks[2], metric_store);
|
||||
}
|
||||
|
||||
fn render_backup_panel(&mut self, frame: &mut Frame, area: Rect) {
|
||||
let backup_block = Components::widget_block("backup");
|
||||
let inner_area = backup_block.inner(area);
|
||||
frame.render_widget(backup_block, area);
|
||||
|
||||
// Get current host widgets for backup widget
|
||||
if let Some(hostname) = self.current_host.clone() {
|
||||
let host_widgets = self.get_or_create_host_widgets(&hostname);
|
||||
host_widgets.backup_widget.render(frame, inner_area);
|
||||
host_widgets.system_widget.render(frame, inner_area, &hostname, Some(&config));
|
||||
}
|
||||
}
|
||||
|
||||
fn render_storage_section(&self, frame: &mut Frame, area: Rect, metric_store: &MetricStore) {
|
||||
if area.height < 2 {
|
||||
return;
|
||||
}
|
||||
|
||||
if let Some(ref hostname) = self.current_host {
|
||||
// Get disk count to determine how many disks to display
|
||||
let disk_count =
|
||||
if let Some(count_metric) = metric_store.get_metric(hostname, "disk_count") {
|
||||
count_metric.value.as_i64().unwrap_or(0) as usize
|
||||
} else {
|
||||
0
|
||||
};
|
||||
/// Render offline host message with wake-up option
|
||||
fn render_offline_host_message(&self, frame: &mut Frame, area: Rect) {
|
||||
use ratatui::layout::Alignment;
|
||||
use ratatui::style::Modifier;
|
||||
use ratatui::text::{Line, Span};
|
||||
use ratatui::widgets::{Block, Borders, Paragraph};
|
||||
|
||||
if disk_count == 0 {
|
||||
// No disks found - show error/waiting message
|
||||
let content_chunks = ratatui::layout::Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([Constraint::Length(1), Constraint::Min(0)])
|
||||
.split(area);
|
||||
// Get hostname for message
|
||||
let hostname = self.current_host.as_ref()
|
||||
.map(|h| h.as_str())
|
||||
.unwrap_or("Unknown");
|
||||
|
||||
let disk_title = Paragraph::new("Storage:").style(Typography::widget_title());
|
||||
frame.render_widget(disk_title, content_chunks[0]);
|
||||
// Check if host has MAC address for wake-on-LAN
|
||||
let has_mac = self.current_host.as_ref()
|
||||
.and_then(|hostname| self.config.hosts.get(hostname))
|
||||
.and_then(|details| details.mac_address.as_ref())
|
||||
.is_some();
|
||||
|
||||
let no_disks_spans =
|
||||
StatusIcons::create_status_spans(Status::Unknown, "No mounted disks detected");
|
||||
let no_disks_para = Paragraph::new(ratatui::text::Line::from(no_disks_spans));
|
||||
frame.render_widget(no_disks_para, content_chunks[1]);
|
||||
return;
|
||||
}
|
||||
// Create message content
|
||||
let mut lines = vec![
|
||||
Line::from(Span::styled(
|
||||
format!("Host '{}' is offline", hostname),
|
||||
Style::default().fg(Theme::muted_text()).add_modifier(Modifier::BOLD),
|
||||
)),
|
||||
Line::from(""),
|
||||
];
|
||||
|
||||
// Group disks by physical device
|
||||
let mut physical_devices: std::collections::HashMap<String, Vec<usize>> =
|
||||
std::collections::HashMap::new();
|
||||
|
||||
for disk_index in 0..disk_count {
|
||||
if let Some(physical_device_metric) = metric_store
|
||||
.get_metric(hostname, &format!("disk_{}_physical_device", disk_index))
|
||||
{
|
||||
let physical_device = physical_device_metric.value.as_string();
|
||||
physical_devices
|
||||
.entry(physical_device)
|
||||
.or_insert_with(Vec::new)
|
||||
.push(disk_index);
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate how many lines we need
|
||||
let mut _total_lines_needed = 0;
|
||||
for partitions in physical_devices.values() {
|
||||
_total_lines_needed += 2 + partitions.len(); // title + health + usage_per_partition
|
||||
}
|
||||
|
||||
let available_lines = area.height as usize;
|
||||
|
||||
// Create constraints dynamically based on physical devices
|
||||
let mut constraints = Vec::new();
|
||||
let mut devices_to_show = Vec::new();
|
||||
let mut current_line = 0;
|
||||
|
||||
// Sort physical devices by name for consistent ordering
|
||||
let mut sorted_devices: Vec<_> = physical_devices.iter().collect();
|
||||
sorted_devices.sort_by_key(|(device_name, _)| device_name.as_str());
|
||||
|
||||
for (physical_device, partitions) in sorted_devices {
|
||||
let lines_for_this_device = 2 + partitions.len();
|
||||
if current_line + lines_for_this_device <= available_lines {
|
||||
devices_to_show.push((physical_device.clone(), partitions.clone()));
|
||||
|
||||
// Add constraints for this device
|
||||
constraints.push(Constraint::Length(1)); // Device title
|
||||
constraints.push(Constraint::Length(1)); // Health line
|
||||
for _ in 0..partitions.len() {
|
||||
constraints.push(Constraint::Length(1)); // Usage line per partition
|
||||
}
|
||||
|
||||
current_line += lines_for_this_device;
|
||||
} else {
|
||||
break; // Can't fit more devices
|
||||
}
|
||||
}
|
||||
|
||||
// Add remaining space if any
|
||||
if constraints.len() < available_lines {
|
||||
constraints.push(Constraint::Min(0));
|
||||
}
|
||||
|
||||
let content_chunks = ratatui::layout::Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints(constraints)
|
||||
.split(area);
|
||||
|
||||
let mut chunk_index = 0;
|
||||
|
||||
// Display each physical device
|
||||
for (physical_device, partitions) in &devices_to_show {
|
||||
// Device title
|
||||
let disk_title_text = format!("Disk {}:", physical_device);
|
||||
let disk_title_para =
|
||||
Paragraph::new(disk_title_text).style(Typography::widget_title());
|
||||
frame.render_widget(disk_title_para, content_chunks[chunk_index]);
|
||||
chunk_index += 1;
|
||||
|
||||
// Health status (one per physical device)
|
||||
let smart_health = metric_store
|
||||
.get_metric(hostname, &format!("disk_smart_{}_health", physical_device))
|
||||
.map(|m| (m.value.as_string(), m.status))
|
||||
.unwrap_or_else(|| ("Unknown".to_string(), Status::Unknown));
|
||||
|
||||
let smart_temp = metric_store
|
||||
.get_metric(
|
||||
hostname,
|
||||
&format!("disk_smart_{}_temperature", physical_device),
|
||||
)
|
||||
.and_then(|m| m.value.as_f32());
|
||||
|
||||
let temp_text = if let Some(temp) = smart_temp {
|
||||
format!(" {}°C", temp as i32)
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
let health_spans = StatusIcons::create_status_spans(
|
||||
smart_health.1,
|
||||
&format!("Health: {}{}", smart_health.0, temp_text),
|
||||
);
|
||||
let health_para = Paragraph::new(ratatui::text::Line::from(health_spans));
|
||||
frame.render_widget(health_para, content_chunks[chunk_index]);
|
||||
chunk_index += 1;
|
||||
|
||||
// Usage lines (one per partition/mount point)
|
||||
// Sort partitions by disk index for consistent ordering
|
||||
let mut sorted_partitions = partitions.clone();
|
||||
sorted_partitions.sort();
|
||||
for &disk_index in &sorted_partitions {
|
||||
let mount_point = metric_store
|
||||
.get_metric(hostname, &format!("disk_{}_mount_point", disk_index))
|
||||
.map(|m| m.value.as_string())
|
||||
.unwrap_or("?".to_string());
|
||||
|
||||
let usage_percent = metric_store
|
||||
.get_metric(hostname, &format!("disk_{}_usage_percent", disk_index))
|
||||
.and_then(|m| m.value.as_f32())
|
||||
.unwrap_or(0.0);
|
||||
|
||||
let used_gb = metric_store
|
||||
.get_metric(hostname, &format!("disk_{}_used_gb", disk_index))
|
||||
.and_then(|m| m.value.as_f32())
|
||||
.unwrap_or(0.0);
|
||||
|
||||
let total_gb = metric_store
|
||||
.get_metric(hostname, &format!("disk_{}_total_gb", disk_index))
|
||||
.and_then(|m| m.value.as_f32())
|
||||
.unwrap_or(0.0);
|
||||
|
||||
let usage_status = metric_store
|
||||
.get_metric(hostname, &format!("disk_{}_usage_percent", disk_index))
|
||||
.map(|m| m.status)
|
||||
.unwrap_or(Status::Unknown);
|
||||
|
||||
// Format mount point for usage line
|
||||
let mount_display = if mount_point == "/" {
|
||||
"root".to_string()
|
||||
} else if mount_point == "/boot" {
|
||||
"boot".to_string()
|
||||
} else if mount_point.starts_with("/") {
|
||||
mount_point[1..].to_string() // Remove leading slash
|
||||
} else {
|
||||
mount_point.clone()
|
||||
};
|
||||
|
||||
let usage_spans = StatusIcons::create_status_spans(
|
||||
usage_status,
|
||||
&format!(
|
||||
"Usage @{}: {:.1}% • {:.1}/{:.1} GB",
|
||||
mount_display, usage_percent, used_gb, total_gb
|
||||
),
|
||||
);
|
||||
let usage_para = Paragraph::new(ratatui::text::Line::from(usage_spans));
|
||||
frame.render_widget(usage_para, content_chunks[chunk_index]);
|
||||
chunk_index += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Show truncation indicator if we couldn't display all devices
|
||||
if devices_to_show.len() < physical_devices.len() {
|
||||
if let Some(last_chunk) = content_chunks.last() {
|
||||
let truncated_count = physical_devices.len() - devices_to_show.len();
|
||||
let truncated_text = format!(
|
||||
"... and {} more disk{}",
|
||||
truncated_count,
|
||||
if truncated_count == 1 { "" } else { "s" }
|
||||
);
|
||||
let truncated_para = Paragraph::new(truncated_text).style(Typography::muted());
|
||||
frame.render_widget(truncated_para, *last_chunk);
|
||||
}
|
||||
}
|
||||
if has_mac {
|
||||
lines.push(Line::from(Span::styled(
|
||||
"Press 'w' to wake up host",
|
||||
Style::default().fg(Theme::primary_text()).add_modifier(Modifier::BOLD),
|
||||
)));
|
||||
} else {
|
||||
// No host connected
|
||||
let content_chunks = ratatui::layout::Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([Constraint::Length(1), Constraint::Min(0)])
|
||||
.split(area);
|
||||
|
||||
let disk_title = Paragraph::new("Storage:").style(Typography::widget_title());
|
||||
frame.render_widget(disk_title, content_chunks[0]);
|
||||
|
||||
let no_host_spans =
|
||||
StatusIcons::create_status_spans(Status::Unknown, "No host connected");
|
||||
let no_host_para = Paragraph::new(ratatui::text::Line::from(no_host_spans));
|
||||
frame.render_widget(no_host_para, content_chunks[1]);
|
||||
lines.push(Line::from(Span::styled(
|
||||
"No MAC address configured - cannot wake up",
|
||||
Style::default().fg(Theme::muted_text()),
|
||||
)));
|
||||
}
|
||||
|
||||
// Create centered message
|
||||
let message = Paragraph::new(lines)
|
||||
.block(Block::default()
|
||||
.borders(Borders::ALL)
|
||||
.border_style(Style::default().fg(Theme::muted_text()))
|
||||
.title(" Offline Host ")
|
||||
.title_style(Style::default().fg(Theme::muted_text()).add_modifier(Modifier::BOLD)))
|
||||
.style(Style::default().bg(Theme::background()).fg(Theme::primary_text()))
|
||||
.alignment(Alignment::Center);
|
||||
|
||||
// Center the message in the available area
|
||||
let popup_area = ratatui::layout::Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([
|
||||
Constraint::Percentage(40),
|
||||
Constraint::Length(6),
|
||||
Constraint::Percentage(40),
|
||||
])
|
||||
.split(area)[1];
|
||||
|
||||
let popup_area = ratatui::layout::Layout::default()
|
||||
.direction(Direction::Horizontal)
|
||||
.constraints([
|
||||
Constraint::Percentage(25),
|
||||
Constraint::Percentage(50),
|
||||
Constraint::Percentage(25),
|
||||
])
|
||||
.split(popup_area)[1];
|
||||
|
||||
frame.render_widget(message, popup_area);
|
||||
}
|
||||
|
||||
/// Parse MAC address string (e.g., "AA:BB:CC:DD:EE:FF") to [u8; 6]
|
||||
/// Get the connection IP for a hostname based on host configuration
|
||||
fn get_connection_ip(&self, hostname: &str) -> String {
|
||||
if let Some(host_details) = self.config.hosts.get(hostname) {
|
||||
host_details.get_connection_ip(hostname)
|
||||
} else {
|
||||
hostname.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_mac_address(mac_str: &str) -> Result<[u8; 6], &'static str> {
|
||||
let parts: Vec<&str> = mac_str.split(':').collect();
|
||||
if parts.len() != 6 {
|
||||
return Err("MAC address must have 6 parts separated by colons");
|
||||
}
|
||||
|
||||
let mut mac = [0u8; 6];
|
||||
for (i, part) in parts.iter().enumerate() {
|
||||
match u8::from_str_radix(part, 16) {
|
||||
Ok(byte) => mac[i] = byte,
|
||||
Err(_) => return Err("Invalid hexadecimal byte in MAC address"),
|
||||
}
|
||||
}
|
||||
Ok(mac)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use ratatui::style::{Color, Style, Modifier};
|
||||
use ratatui::widgets::{Block, Borders};
|
||||
use cm_dashboard_shared::Status;
|
||||
use ratatui::style::{Color, Modifier, Style};
|
||||
use ratatui::widgets::{Block, Borders};
|
||||
|
||||
/// Complete terminal color palette matching your configuration
|
||||
#[allow(dead_code)]
|
||||
@@ -10,7 +10,7 @@ pub struct TerminalColors {
|
||||
pub dim_foreground: Color,
|
||||
pub bright_foreground: Color,
|
||||
pub background: Color,
|
||||
|
||||
|
||||
// Normal colors
|
||||
pub normal_black: Color,
|
||||
pub normal_red: Color,
|
||||
@@ -20,7 +20,7 @@ pub struct TerminalColors {
|
||||
pub normal_magenta: Color,
|
||||
pub normal_cyan: Color,
|
||||
pub normal_white: Color,
|
||||
|
||||
|
||||
// Bright colors
|
||||
pub bright_black: Color,
|
||||
pub bright_red: Color,
|
||||
@@ -30,7 +30,7 @@ pub struct TerminalColors {
|
||||
pub bright_magenta: Color,
|
||||
pub bright_cyan: Color,
|
||||
pub bright_white: Color,
|
||||
|
||||
|
||||
// Dim colors
|
||||
pub dim_black: Color,
|
||||
pub dim_red: Color,
|
||||
@@ -46,40 +46,40 @@ impl Default for TerminalColors {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
// Primary colors
|
||||
foreground: Color::Rgb(198, 198, 198), // #c6c6c6
|
||||
dim_foreground: Color::Rgb(112, 112, 112), // #707070
|
||||
foreground: Color::Rgb(198, 198, 198), // #c6c6c6
|
||||
dim_foreground: Color::Rgb(112, 112, 112), // #707070
|
||||
bright_foreground: Color::Rgb(255, 255, 255), // #ffffff
|
||||
background: Color::Rgb(38, 38, 38), // #262626
|
||||
|
||||
background: Color::Rgb(38, 38, 38), // #262626
|
||||
|
||||
// Normal colors
|
||||
normal_black: Color::Rgb(0, 0, 0), // #000000
|
||||
normal_red: Color::Rgb(215, 84, 0), // #d75400
|
||||
normal_green: Color::Rgb(175, 215, 135), // #afd787
|
||||
normal_yellow: Color::Rgb(215, 175, 95), // #d7af5f
|
||||
normal_blue: Color::Rgb(135, 175, 215), // #87afd7
|
||||
normal_magenta: Color::Rgb(215, 215, 175), // #d7d7af
|
||||
normal_cyan: Color::Rgb(160, 160, 160), // #a0a0a0
|
||||
normal_white: Color::Rgb(238, 238, 238), // #eeeeee
|
||||
|
||||
normal_black: Color::Rgb(0, 0, 0), // #000000
|
||||
normal_red: Color::Rgb(215, 84, 0), // #d75400
|
||||
normal_green: Color::Rgb(175, 215, 135), // #afd787
|
||||
normal_yellow: Color::Rgb(215, 175, 95), // #d7af5f
|
||||
normal_blue: Color::Rgb(135, 175, 215), // #87afd7
|
||||
normal_magenta: Color::Rgb(215, 215, 175), // #d7d7af
|
||||
normal_cyan: Color::Rgb(160, 160, 160), // #a0a0a0
|
||||
normal_white: Color::Rgb(238, 238, 238), // #eeeeee
|
||||
|
||||
// Bright colors
|
||||
bright_black: Color::Rgb(48, 48, 48), // #303030
|
||||
bright_red: Color::Rgb(215, 84, 0), // #d75400
|
||||
bright_green: Color::Rgb(175, 215, 135), // #afd787
|
||||
bright_yellow: Color::Rgb(215, 175, 95), // #d7af5f
|
||||
bright_blue: Color::Rgb(135, 175, 215), // #87afd7
|
||||
bright_magenta: Color::Rgb(215, 215, 175), // #d7d7af
|
||||
bright_cyan: Color::Rgb(160, 160, 160), // #a0a0a0
|
||||
bright_white: Color::Rgb(255, 255, 255), // #ffffff
|
||||
|
||||
bright_black: Color::Rgb(48, 48, 48), // #303030
|
||||
bright_red: Color::Rgb(215, 84, 0), // #d75400
|
||||
bright_green: Color::Rgb(175, 215, 135), // #afd787
|
||||
bright_yellow: Color::Rgb(215, 175, 95), // #d7af5f
|
||||
bright_blue: Color::Rgb(135, 175, 215), // #87afd7
|
||||
bright_magenta: Color::Rgb(215, 215, 175), // #d7d7af
|
||||
bright_cyan: Color::Rgb(160, 160, 160), // #a0a0a0
|
||||
bright_white: Color::Rgb(255, 255, 255), // #ffffff
|
||||
|
||||
// Dim colors
|
||||
dim_black: Color::Rgb(0, 0, 0), // #000000
|
||||
dim_red: Color::Rgb(215, 84, 0), // #d75400
|
||||
dim_green: Color::Rgb(175, 215, 135), // #afd787
|
||||
dim_yellow: Color::Rgb(215, 175, 95), // #d7af5f
|
||||
dim_blue: Color::Rgb(135, 175, 215), // #87afd7
|
||||
dim_magenta: Color::Rgb(215, 215, 175), // #d7d7af
|
||||
dim_cyan: Color::Rgb(160, 160, 160), // #a0a0a0
|
||||
dim_white: Color::Rgb(221, 221, 221), // #dddddd
|
||||
dim_black: Color::Rgb(0, 0, 0), // #000000
|
||||
dim_red: Color::Rgb(215, 84, 0), // #d75400
|
||||
dim_green: Color::Rgb(175, 215, 135), // #afd787
|
||||
dim_yellow: Color::Rgb(215, 175, 95), // #d7af5f
|
||||
dim_blue: Color::Rgb(135, 175, 215), // #87afd7
|
||||
dim_magenta: Color::Rgb(215, 215, 175), // #d7d7af
|
||||
dim_cyan: Color::Rgb(160, 160, 160), // #a0a0a0
|
||||
dim_white: Color::Rgb(221, 221, 221), // #dddddd
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -93,67 +93,70 @@ impl Theme {
|
||||
static COLORS: std::sync::OnceLock<TerminalColors> = std::sync::OnceLock::new();
|
||||
COLORS.get_or_init(TerminalColors::default)
|
||||
}
|
||||
|
||||
|
||||
// Semantic color mapping using the terminal color struct
|
||||
pub fn primary_text() -> Color {
|
||||
Self::colors().normal_white
|
||||
}
|
||||
|
||||
|
||||
pub fn secondary_text() -> Color {
|
||||
Self::colors().foreground
|
||||
}
|
||||
|
||||
|
||||
pub fn muted_text() -> Color {
|
||||
Self::colors().dim_foreground
|
||||
}
|
||||
|
||||
|
||||
pub fn border() -> Color {
|
||||
Self::colors().dim_foreground
|
||||
}
|
||||
|
||||
|
||||
pub fn border_title() -> Color {
|
||||
Self::colors().bright_white
|
||||
}
|
||||
|
||||
|
||||
pub fn background() -> Color {
|
||||
Self::colors().background
|
||||
}
|
||||
|
||||
|
||||
pub fn success() -> Color {
|
||||
Self::colors().normal_green
|
||||
}
|
||||
|
||||
|
||||
pub fn warning() -> Color {
|
||||
Self::colors().normal_yellow
|
||||
}
|
||||
|
||||
|
||||
pub fn error() -> Color {
|
||||
Self::colors().normal_red
|
||||
}
|
||||
|
||||
|
||||
pub fn info() -> Color {
|
||||
Self::colors().normal_cyan
|
||||
}
|
||||
|
||||
|
||||
pub fn highlight() -> Color {
|
||||
Self::colors().normal_blue
|
||||
}
|
||||
|
||||
|
||||
/// Get color for status level
|
||||
pub fn status_color(status: Status) -> Color {
|
||||
match status {
|
||||
Status::Ok => Self::success(),
|
||||
Status::Inactive => Self::muted_text(), // Gray for inactive services in service list
|
||||
Status::Pending => Self::highlight(), // Blue for pending
|
||||
Status::Warning => Self::warning(),
|
||||
Status::Critical => Self::error(),
|
||||
Status::Unknown => Self::muted_text(),
|
||||
Status::Offline => Self::muted_text(), // Dark gray for offline
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Get style for status level
|
||||
pub fn status_style(status: Status) -> Style {
|
||||
Style::default().fg(Self::status_color(status))
|
||||
}
|
||||
|
||||
|
||||
/// CPU usage colors using terminal color struct
|
||||
pub fn cpu_color(percentage: u16) -> Color {
|
||||
match percentage {
|
||||
@@ -164,7 +167,7 @@ impl Theme {
|
||||
_ => Self::colors().normal_red, // Over 100%
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Memory usage colors using terminal color struct
|
||||
pub fn memory_color(percentage: u16) -> Color {
|
||||
match percentage {
|
||||
@@ -175,7 +178,7 @@ impl Theme {
|
||||
_ => Self::colors().normal_red, // Over 100%
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Get gauge color based on percentage
|
||||
pub fn gauge_color(percentage: u16, warning_threshold: u16, critical_threshold: u16) -> Color {
|
||||
if percentage >= critical_threshold {
|
||||
@@ -186,25 +189,31 @@ impl Theme {
|
||||
Self::success()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Widget border style
|
||||
pub fn widget_border_style() -> Style {
|
||||
Style::default().fg(Self::border()).bg(Self::background())
|
||||
}
|
||||
|
||||
|
||||
/// Inactive widget border style
|
||||
pub fn widget_border_inactive_style() -> Style {
|
||||
Style::default().fg(Self::muted_text()).bg(Self::background())
|
||||
Style::default()
|
||||
.fg(Self::muted_text())
|
||||
.bg(Self::background())
|
||||
}
|
||||
|
||||
|
||||
/// Title style
|
||||
pub fn title_style() -> Style {
|
||||
Style::default().fg(Self::border_title()).bg(Self::background())
|
||||
Style::default()
|
||||
.fg(Self::border_title())
|
||||
.bg(Self::background())
|
||||
}
|
||||
|
||||
|
||||
/// Status bar style
|
||||
pub fn status_bar_style() -> Style {
|
||||
Style::default().fg(Self::muted_text()).bg(Self::background())
|
||||
Style::default()
|
||||
.fg(Self::muted_text())
|
||||
.bg(Self::background())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -216,13 +225,6 @@ impl Layout {
|
||||
pub const LEFT_PANEL_WIDTH: u16 = 45;
|
||||
/// Right panel percentage (services)
|
||||
pub const RIGHT_PANEL_WIDTH: u16 = 55;
|
||||
/// System vs backup split (equal)
|
||||
pub const SYSTEM_PANEL_HEIGHT: u16 = 50;
|
||||
pub const BACKUP_PANEL_HEIGHT: u16 = 50;
|
||||
/// System panel CPU section height
|
||||
pub const CPU_SECTION_HEIGHT: u16 = 2;
|
||||
/// System panel memory section height
|
||||
pub const MEMORY_SECTION_HEIGHT: u16 = 3;
|
||||
}
|
||||
|
||||
/// Typography system
|
||||
@@ -239,35 +241,41 @@ impl StatusIcons {
|
||||
pub fn get_icon(status: Status) -> &'static str {
|
||||
match status {
|
||||
Status::Ok => "●",
|
||||
Status::Inactive => "○", // Empty circle for inactive services
|
||||
Status::Pending => "◉", // Hollow circle for pending
|
||||
Status::Warning => "◐",
|
||||
Status::Critical => "◯",
|
||||
Status::Critical => "!",
|
||||
Status::Unknown => "?",
|
||||
Status::Offline => "○", // Empty circle for offline
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Create spans with status icon colored and text in foreground color
|
||||
pub fn create_status_spans(status: Status, text: &str) -> Vec<ratatui::text::Span<'static>> {
|
||||
let icon = Self::get_icon(status);
|
||||
let status_color = match status {
|
||||
Status::Ok => Theme::success(), // Green
|
||||
Status::Warning => Theme::warning(), // Yellow
|
||||
Status::Critical => Theme::error(), // Red
|
||||
Status::Ok => Theme::success(), // Green
|
||||
Status::Inactive => Theme::muted_text(), // Gray for inactive services
|
||||
Status::Pending => Theme::highlight(), // Blue
|
||||
Status::Warning => Theme::warning(), // Yellow
|
||||
Status::Critical => Theme::error(), // Red
|
||||
Status::Unknown => Theme::muted_text(), // Gray
|
||||
Status::Offline => Theme::muted_text(), // Dark gray for offline
|
||||
};
|
||||
|
||||
|
||||
vec![
|
||||
ratatui::text::Span::styled(
|
||||
format!("{} ", icon),
|
||||
Style::default().fg(status_color).bg(Theme::background())
|
||||
Style::default().fg(status_color).bg(Theme::background()),
|
||||
),
|
||||
ratatui::text::Span::styled(
|
||||
text.to_string(),
|
||||
Style::default().fg(Theme::secondary_text()).bg(Theme::background())
|
||||
Style::default()
|
||||
.fg(Theme::secondary_text())
|
||||
.bg(Theme::background()),
|
||||
),
|
||||
]
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl Components {
|
||||
@@ -277,19 +285,17 @@ impl Components {
|
||||
.title(title)
|
||||
.borders(Borders::ALL)
|
||||
.style(Style::default().fg(Theme::border()).bg(Theme::background()))
|
||||
.title_style(Style::default().fg(Theme::border_title()).bg(Theme::background()))
|
||||
.title_style(
|
||||
Style::default()
|
||||
.fg(Theme::border_title())
|
||||
.bg(Theme::background()),
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
impl Typography {
|
||||
/// Main title style (dashboard header)
|
||||
pub fn title() -> Style {
|
||||
Style::default()
|
||||
.fg(Theme::primary_text())
|
||||
.bg(Theme::background())
|
||||
}
|
||||
|
||||
|
||||
/// Widget title style (panel headers) - bold bright white
|
||||
pub fn widget_title() -> Style {
|
||||
Style::default()
|
||||
@@ -297,14 +303,14 @@ impl Typography {
|
||||
.bg(Theme::background())
|
||||
.add_modifier(Modifier::BOLD)
|
||||
}
|
||||
|
||||
|
||||
/// Secondary content text
|
||||
pub fn secondary() -> Style {
|
||||
Style::default()
|
||||
.fg(Theme::secondary_text())
|
||||
.bg(Theme::background())
|
||||
}
|
||||
|
||||
|
||||
/// Muted text (inactive items, placeholders) - now bold bright white for headers
|
||||
pub fn muted() -> Style {
|
||||
Style::default()
|
||||
@@ -312,5 +318,11 @@ impl Typography {
|
||||
.bg(Theme::background())
|
||||
.add_modifier(Modifier::BOLD)
|
||||
}
|
||||
|
||||
|
||||
/// Tree symbols style (blue color)
|
||||
pub fn tree() -> Style {
|
||||
Style::default()
|
||||
.fg(Theme::highlight())
|
||||
.bg(Theme::background())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,482 +0,0 @@
|
||||
use cm_dashboard_shared::{Metric, Status};
|
||||
use ratatui::{
|
||||
layout::{Constraint, Direction, Layout, Rect},
|
||||
widgets::Paragraph,
|
||||
Frame,
|
||||
};
|
||||
use tracing::debug;
|
||||
|
||||
use super::Widget;
|
||||
use crate::ui::theme::{StatusIcons, Typography};
|
||||
|
||||
/// Backup widget displaying backup status, services, and repository information
|
||||
#[derive(Clone)]
|
||||
pub struct BackupWidget {
|
||||
/// Overall backup status
|
||||
overall_status: Status,
|
||||
/// Last backup duration in seconds
|
||||
duration_seconds: Option<i64>,
|
||||
/// Last backup timestamp
|
||||
last_run_timestamp: Option<i64>,
|
||||
/// Total number of backup services
|
||||
total_services: Option<i64>,
|
||||
/// Total repository size in GB
|
||||
total_repo_size_gb: Option<f32>,
|
||||
/// Total disk space for backups in GB
|
||||
backup_disk_total_gb: Option<f32>,
|
||||
/// Used disk space for backups in GB
|
||||
backup_disk_used_gb: Option<f32>,
|
||||
/// Backup disk product name from SMART data
|
||||
backup_disk_product_name: Option<String>,
|
||||
/// Backup disk serial number from SMART data
|
||||
backup_disk_serial_number: Option<String>,
|
||||
/// Backup disk filesystem label
|
||||
backup_disk_filesystem_label: Option<String>,
|
||||
/// Number of completed services
|
||||
services_completed_count: Option<i64>,
|
||||
/// Number of failed services
|
||||
services_failed_count: Option<i64>,
|
||||
/// Number of disabled services
|
||||
services_disabled_count: Option<i64>,
|
||||
/// All individual service metrics for detailed display
|
||||
service_metrics: Vec<ServiceMetricData>,
|
||||
/// Last update indicator
|
||||
has_data: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct ServiceMetricData {
|
||||
name: String,
|
||||
status: Status,
|
||||
exit_code: Option<i64>,
|
||||
archive_count: Option<i64>,
|
||||
repo_size_gb: Option<f32>,
|
||||
}
|
||||
|
||||
impl BackupWidget {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
overall_status: Status::Unknown,
|
||||
duration_seconds: None,
|
||||
last_run_timestamp: None,
|
||||
total_services: None,
|
||||
total_repo_size_gb: None,
|
||||
backup_disk_total_gb: None,
|
||||
backup_disk_used_gb: None,
|
||||
backup_disk_product_name: None,
|
||||
backup_disk_serial_number: None,
|
||||
backup_disk_filesystem_label: None,
|
||||
services_completed_count: None,
|
||||
services_failed_count: None,
|
||||
services_disabled_count: None,
|
||||
service_metrics: Vec::new(),
|
||||
has_data: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if the backup widget has any data to display
|
||||
pub fn has_data(&self) -> bool {
|
||||
self.has_data
|
||||
}
|
||||
|
||||
|
||||
/// Format duration for display
|
||||
fn format_duration(&self) -> String {
|
||||
match self.duration_seconds {
|
||||
Some(seconds) => {
|
||||
if seconds >= 3600 {
|
||||
format!("{:.1}h", seconds as f32 / 3600.0)
|
||||
} else if seconds >= 60 {
|
||||
format!("{:.1}m", seconds as f32 / 60.0)
|
||||
} else {
|
||||
format!("{}s", seconds)
|
||||
}
|
||||
}
|
||||
None => "—".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Format timestamp for display
|
||||
fn format_last_run(&self) -> String {
|
||||
match self.last_run_timestamp {
|
||||
Some(timestamp) => {
|
||||
let duration = chrono::Utc::now().timestamp() - timestamp;
|
||||
if duration < 3600 {
|
||||
format!("{}m ago", duration / 60)
|
||||
} else if duration < 86400 {
|
||||
format!("{}h ago", duration / 3600)
|
||||
} else {
|
||||
format!("{}d ago", duration / 86400)
|
||||
}
|
||||
}
|
||||
None => "—".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Format disk usage in format "usedGB/totalGB"
|
||||
fn format_repo_size(&self) -> String {
|
||||
match (self.backup_disk_used_gb, self.backup_disk_total_gb) {
|
||||
(Some(used_gb), Some(total_gb)) => {
|
||||
let used_str = Self::format_size_with_proper_units(used_gb);
|
||||
let total_str = Self::format_size_with_proper_units(total_gb);
|
||||
format!("{}/{}", used_str, total_str)
|
||||
}
|
||||
(Some(used_gb), None) => {
|
||||
// Fallback to just used size if total not available
|
||||
Self::format_size_with_proper_units(used_gb)
|
||||
}
|
||||
_ => "—".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Format size with proper units (xxxkB/MB/GB/TB)
|
||||
fn format_size_with_proper_units(size_gb: f32) -> String {
|
||||
if size_gb >= 1000.0 {
|
||||
// TB range
|
||||
format!("{:.1}TB", size_gb / 1000.0)
|
||||
} else if size_gb >= 1.0 {
|
||||
// GB range
|
||||
format!("{:.1}GB", size_gb)
|
||||
} else if size_gb >= 0.001 {
|
||||
// MB range (size_gb * 1024 = MB)
|
||||
let size_mb = size_gb * 1024.0;
|
||||
format!("{:.1}MB", size_mb)
|
||||
} else if size_gb >= 0.000001 {
|
||||
// kB range (size_gb * 1024 * 1024 = kB)
|
||||
let size_kb = size_gb * 1024.0 * 1024.0;
|
||||
format!("{:.0}kB", size_kb)
|
||||
} else {
|
||||
// B range (size_gb * 1024^3 = bytes)
|
||||
let size_bytes = size_gb * 1024.0 * 1024.0 * 1024.0;
|
||||
format!("{:.0}B", size_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
/// Format product name display
|
||||
fn format_product_name(&self) -> String {
|
||||
if let Some(ref product_name) = self.backup_disk_product_name {
|
||||
format!("P/N: {}", product_name)
|
||||
} else {
|
||||
"P/N: Unknown".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Format serial number display
|
||||
fn format_serial_number(&self) -> String {
|
||||
if let Some(ref serial) = self.backup_disk_serial_number {
|
||||
format!("S/N: {}", serial)
|
||||
} else {
|
||||
"S/N: Unknown".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract service name from metric name (e.g., "backup_service_gitea_status" -> "gitea")
|
||||
fn extract_service_name(metric_name: &str) -> Option<String> {
|
||||
if metric_name.starts_with("backup_service_") {
|
||||
let name_part = &metric_name[15..]; // Remove "backup_service_" prefix
|
||||
|
||||
// Try to extract service name by removing known suffixes
|
||||
if let Some(service_name) = name_part.strip_suffix("_status") {
|
||||
Some(service_name.to_string())
|
||||
} else if let Some(service_name) = name_part.strip_suffix("_exit_code") {
|
||||
Some(service_name.to_string())
|
||||
} else if let Some(service_name) = name_part.strip_suffix("_archive_count") {
|
||||
Some(service_name.to_string())
|
||||
} else if let Some(service_name) = name_part.strip_suffix("_repo_size_gb") {
|
||||
Some(service_name.to_string())
|
||||
} else if let Some(service_name) = name_part.strip_suffix("_repo_path") {
|
||||
Some(service_name.to_string())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Widget for BackupWidget {
|
||||
fn update_from_metrics(&mut self, metrics: &[&Metric]) {
|
||||
debug!("Backup widget updating with {} metrics", metrics.len());
|
||||
for metric in metrics {
|
||||
debug!(
|
||||
"Backup metric: {} = {:?} (status: {:?})",
|
||||
metric.name, metric.value, metric.status
|
||||
);
|
||||
}
|
||||
|
||||
// Also debug the service_data after processing
|
||||
debug!("Processing individual service metrics...");
|
||||
|
||||
// Log how many metrics are backup service metrics
|
||||
let service_metric_count = metrics
|
||||
.iter()
|
||||
.filter(|m| m.name.starts_with("backup_service_"))
|
||||
.count();
|
||||
debug!(
|
||||
"Found {} backup_service_ metrics out of {} total backup metrics",
|
||||
service_metric_count,
|
||||
metrics.len()
|
||||
);
|
||||
|
||||
// Reset service metrics
|
||||
self.service_metrics.clear();
|
||||
let mut service_data: std::collections::HashMap<String, ServiceMetricData> =
|
||||
std::collections::HashMap::new();
|
||||
|
||||
for metric in metrics {
|
||||
match metric.name.as_str() {
|
||||
"backup_overall_status" => {
|
||||
let status_str = metric.value.as_string();
|
||||
self.overall_status = match status_str.as_str() {
|
||||
"ok" => Status::Ok,
|
||||
"warning" => Status::Warning,
|
||||
"critical" => Status::Critical,
|
||||
_ => Status::Unknown,
|
||||
};
|
||||
}
|
||||
"backup_duration_seconds" => {
|
||||
self.duration_seconds = metric.value.as_i64();
|
||||
}
|
||||
"backup_last_run_timestamp" => {
|
||||
self.last_run_timestamp = metric.value.as_i64();
|
||||
}
|
||||
"backup_total_services" => {
|
||||
self.total_services = metric.value.as_i64();
|
||||
}
|
||||
"backup_total_repo_size_gb" => {
|
||||
self.total_repo_size_gb = metric.value.as_f32();
|
||||
}
|
||||
"backup_disk_total_gb" => {
|
||||
self.backup_disk_total_gb = metric.value.as_f32();
|
||||
}
|
||||
"backup_disk_used_gb" => {
|
||||
self.backup_disk_used_gb = metric.value.as_f32();
|
||||
}
|
||||
"backup_disk_product_name" => {
|
||||
self.backup_disk_product_name = Some(metric.value.as_string());
|
||||
}
|
||||
"backup_disk_serial_number" => {
|
||||
self.backup_disk_serial_number = Some(metric.value.as_string());
|
||||
}
|
||||
"backup_disk_filesystem_label" => {
|
||||
self.backup_disk_filesystem_label = Some(metric.value.as_string());
|
||||
}
|
||||
"backup_services_completed_count" => {
|
||||
self.services_completed_count = metric.value.as_i64();
|
||||
}
|
||||
"backup_services_failed_count" => {
|
||||
self.services_failed_count = metric.value.as_i64();
|
||||
}
|
||||
"backup_services_disabled_count" => {
|
||||
self.services_disabled_count = metric.value.as_i64();
|
||||
}
|
||||
_ => {
|
||||
// Handle individual service metrics
|
||||
if let Some(service_name) = Self::extract_service_name(&metric.name) {
|
||||
debug!(
|
||||
"Extracted service name '{}' from metric '{}'",
|
||||
service_name, metric.name
|
||||
);
|
||||
let entry = service_data.entry(service_name.clone()).or_insert_with(|| {
|
||||
ServiceMetricData {
|
||||
name: service_name,
|
||||
status: Status::Unknown,
|
||||
exit_code: None,
|
||||
archive_count: None,
|
||||
repo_size_gb: None,
|
||||
}
|
||||
});
|
||||
|
||||
if metric.name.ends_with("_status") {
|
||||
entry.status = metric.status;
|
||||
debug!("Set status for {}: {:?}", entry.name, entry.status);
|
||||
} else if metric.name.ends_with("_exit_code") {
|
||||
entry.exit_code = metric.value.as_i64();
|
||||
} else if metric.name.ends_with("_archive_count") {
|
||||
entry.archive_count = metric.value.as_i64();
|
||||
debug!(
|
||||
"Set archive_count for {}: {:?}",
|
||||
entry.name, entry.archive_count
|
||||
);
|
||||
} else if metric.name.ends_with("_repo_size_gb") {
|
||||
entry.repo_size_gb = metric.value.as_f32();
|
||||
debug!(
|
||||
"Set repo_size_gb for {}: {:?}",
|
||||
entry.name, entry.repo_size_gb
|
||||
);
|
||||
}
|
||||
} else {
|
||||
debug!(
|
||||
"Could not extract service name from metric: {}",
|
||||
metric.name
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Convert service data to sorted vector
|
||||
let mut services: Vec<ServiceMetricData> = service_data.into_values().collect();
|
||||
services.sort_by(|a, b| a.name.cmp(&b.name));
|
||||
self.service_metrics = services;
|
||||
|
||||
self.has_data = !metrics.is_empty();
|
||||
|
||||
debug!(
|
||||
"Backup widget updated: status={:?}, services={}, total_size={:?}GB",
|
||||
self.overall_status,
|
||||
self.service_metrics.len(),
|
||||
self.total_repo_size_gb
|
||||
);
|
||||
|
||||
// Debug individual service data
|
||||
for service in &self.service_metrics {
|
||||
debug!(
|
||||
"Service {}: status={:?}, archives={:?}, size={:?}GB",
|
||||
service.name, service.status, service.archive_count, service.repo_size_gb
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn render(&mut self, frame: &mut Frame, area: Rect) {
|
||||
// Split area into header and services list
|
||||
let chunks = Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([
|
||||
Constraint::Length(6), // Header with "Latest backup" title, status, P/N, and S/N
|
||||
Constraint::Min(0), // Service list
|
||||
])
|
||||
.split(area);
|
||||
|
||||
// Render backup overview
|
||||
self.render_backup_overview(frame, chunks[0]);
|
||||
|
||||
// Render services list
|
||||
self.render_services_list(frame, chunks[1]);
|
||||
}
|
||||
}
|
||||
|
||||
impl BackupWidget {
|
||||
/// Render backup overview section
|
||||
fn render_backup_overview(&self, frame: &mut Frame, area: Rect) {
|
||||
let content_chunks = Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([
|
||||
Constraint::Length(1), // "Latest backup" header
|
||||
Constraint::Length(1), // Status line
|
||||
Constraint::Length(1), // Duration and last run
|
||||
Constraint::Length(1), // Repository size
|
||||
Constraint::Length(1), // Product name
|
||||
Constraint::Length(1), // Serial number
|
||||
Constraint::Min(0), // Remaining space
|
||||
])
|
||||
.split(area);
|
||||
|
||||
// "Latest backup" header
|
||||
let header_para = Paragraph::new("Latest backup:").style(Typography::widget_title());
|
||||
frame.render_widget(header_para, content_chunks[0]);
|
||||
|
||||
// Status line
|
||||
let status_text = format!(
|
||||
"Status: {}",
|
||||
match self.overall_status {
|
||||
Status::Ok => "OK",
|
||||
Status::Warning => "Warning",
|
||||
Status::Critical => "Failed",
|
||||
Status::Unknown => "Unknown",
|
||||
}
|
||||
);
|
||||
let status_spans = StatusIcons::create_status_spans(self.overall_status, &status_text);
|
||||
let status_para = Paragraph::new(ratatui::text::Line::from(status_spans));
|
||||
frame.render_widget(status_para, content_chunks[1]);
|
||||
|
||||
// Duration and last run
|
||||
let time_text = format!(
|
||||
"Duration: {} • Last: {}",
|
||||
self.format_duration(),
|
||||
self.format_last_run()
|
||||
);
|
||||
let time_para = Paragraph::new(time_text).style(Typography::secondary());
|
||||
frame.render_widget(time_para, content_chunks[2]);
|
||||
|
||||
// Repository size
|
||||
let size_text = format!("Disk usage: {}", self.format_repo_size());
|
||||
let size_para = Paragraph::new(size_text).style(Typography::secondary());
|
||||
frame.render_widget(size_para, content_chunks[3]);
|
||||
|
||||
// Product name
|
||||
let product_text = self.format_product_name();
|
||||
let product_para = Paragraph::new(product_text).style(Typography::secondary());
|
||||
frame.render_widget(product_para, content_chunks[4]);
|
||||
|
||||
// Serial number
|
||||
let serial_text = self.format_serial_number();
|
||||
let serial_para = Paragraph::new(serial_text).style(Typography::secondary());
|
||||
frame.render_widget(serial_para, content_chunks[5]);
|
||||
}
|
||||
|
||||
/// Render services list section
|
||||
fn render_services_list(&self, frame: &mut Frame, area: Rect) {
|
||||
if area.height < 1 {
|
||||
return;
|
||||
}
|
||||
|
||||
let available_lines = area.height as usize;
|
||||
let services_to_show = self.service_metrics.iter().take(available_lines);
|
||||
|
||||
let mut y_offset = 0;
|
||||
for service in services_to_show {
|
||||
if y_offset >= available_lines {
|
||||
break;
|
||||
}
|
||||
|
||||
let service_area = Rect {
|
||||
x: area.x,
|
||||
y: area.y + y_offset as u16,
|
||||
width: area.width,
|
||||
height: 1,
|
||||
};
|
||||
|
||||
let service_info = if let (Some(archives), Some(size_gb)) =
|
||||
(service.archive_count, service.repo_size_gb)
|
||||
{
|
||||
let size_str = Self::format_size_with_proper_units(size_gb);
|
||||
format!(" {}archives {}", archives, size_str)
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
let service_text = format!("{}{}", service.name, service_info);
|
||||
let service_spans = StatusIcons::create_status_spans(service.status, &service_text);
|
||||
let service_para = Paragraph::new(ratatui::text::Line::from(service_spans));
|
||||
|
||||
frame.render_widget(service_para, service_area);
|
||||
y_offset += 1;
|
||||
}
|
||||
|
||||
// If there are more services than we can show, indicate that
|
||||
if self.service_metrics.len() > available_lines {
|
||||
let more_count = self.service_metrics.len() - available_lines;
|
||||
if available_lines > 0 {
|
||||
let last_line_area = Rect {
|
||||
x: area.x,
|
||||
y: area.y + (available_lines - 1) as u16,
|
||||
width: area.width,
|
||||
height: 1,
|
||||
};
|
||||
|
||||
let more_text = format!("... and {} more services", more_count);
|
||||
let more_para = Paragraph::new(more_text).style(Typography::muted());
|
||||
|
||||
frame.render_widget(more_para, last_line_area);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BackupWidget {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
@@ -1,133 +0,0 @@
|
||||
use cm_dashboard_shared::{Metric, Status};
|
||||
use ratatui::{
|
||||
layout::{Constraint, Direction, Layout, Rect},
|
||||
widgets::Paragraph,
|
||||
Frame,
|
||||
};
|
||||
use tracing::debug;
|
||||
|
||||
use super::Widget;
|
||||
use crate::ui::theme::{Typography, StatusIcons};
|
||||
|
||||
/// CPU widget displaying load, temperature, and frequency
|
||||
#[derive(Clone)]
|
||||
pub struct CpuWidget {
|
||||
/// CPU load averages (1, 5, 15 minutes)
|
||||
load_1min: Option<f32>,
|
||||
load_5min: Option<f32>,
|
||||
load_15min: Option<f32>,
|
||||
/// CPU temperature in Celsius
|
||||
temperature: Option<f32>,
|
||||
/// CPU frequency in MHz
|
||||
frequency: Option<f32>,
|
||||
/// Aggregated status
|
||||
status: Status,
|
||||
/// Last update indicator
|
||||
has_data: bool,
|
||||
}
|
||||
|
||||
impl CpuWidget {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
load_1min: None,
|
||||
load_5min: None,
|
||||
load_15min: None,
|
||||
temperature: None,
|
||||
frequency: None,
|
||||
status: Status::Unknown,
|
||||
has_data: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Format load average for display
|
||||
fn format_load(&self) -> String {
|
||||
match (self.load_1min, self.load_5min, self.load_15min) {
|
||||
(Some(l1), Some(l5), Some(l15)) => {
|
||||
format!("{:.2} {:.2} {:.2}", l1, l5, l15)
|
||||
}
|
||||
_ => "— — —".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Format frequency for display
|
||||
fn format_frequency(&self) -> String {
|
||||
match self.frequency {
|
||||
Some(freq) => format!("{:.1} MHz", freq),
|
||||
None => "— MHz".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl Widget for CpuWidget {
|
||||
fn update_from_metrics(&mut self, metrics: &[&Metric]) {
|
||||
debug!("CPU widget updating with {} metrics", metrics.len());
|
||||
|
||||
// Reset status aggregation
|
||||
let mut statuses = Vec::new();
|
||||
|
||||
for metric in metrics {
|
||||
match metric.name.as_str() {
|
||||
"cpu_load_1min" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.load_1min = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
"cpu_load_5min" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.load_5min = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
"cpu_load_15min" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.load_15min = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
"cpu_temperature_celsius" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.temperature = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
"cpu_frequency_mhz" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.frequency = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// Aggregate status
|
||||
self.status = if statuses.is_empty() {
|
||||
Status::Unknown
|
||||
} else {
|
||||
Status::aggregate(&statuses)
|
||||
};
|
||||
|
||||
self.has_data = !metrics.is_empty();
|
||||
|
||||
debug!("CPU widget updated: load={:?}, temp={:?}, freq={:?}, status={:?}",
|
||||
self.load_1min, self.temperature, self.frequency, self.status);
|
||||
}
|
||||
|
||||
fn render(&mut self, frame: &mut Frame, area: Rect) {
|
||||
let content_chunks = Layout::default().direction(Direction::Vertical).constraints([Constraint::Length(1), Constraint::Length(1)]).split(area);
|
||||
let cpu_title = Paragraph::new("CPU:").style(Typography::widget_title());
|
||||
frame.render_widget(cpu_title, content_chunks[0]);
|
||||
let load_freq_spans = StatusIcons::create_status_spans(self.status, &format!("Load: {} • {}", self.format_load(), self.format_frequency()));
|
||||
let load_freq_para = Paragraph::new(ratatui::text::Line::from(load_freq_spans));
|
||||
frame.render_widget(load_freq_para, content_chunks[1]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl Default for CpuWidget {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
@@ -1,240 +0,0 @@
|
||||
use cm_dashboard_shared::{Metric, Status};
|
||||
use ratatui::{
|
||||
layout::{Constraint, Direction, Layout, Rect},
|
||||
widgets::Paragraph,
|
||||
Frame,
|
||||
};
|
||||
use tracing::debug;
|
||||
|
||||
use super::Widget;
|
||||
use crate::ui::theme::{Typography, StatusIcons};
|
||||
|
||||
/// Memory widget displaying usage, totals, and swap information
|
||||
#[derive(Clone)]
|
||||
pub struct MemoryWidget {
|
||||
/// Memory usage percentage
|
||||
usage_percent: Option<f32>,
|
||||
/// Total memory in GB
|
||||
total_gb: Option<f32>,
|
||||
/// Used memory in GB
|
||||
used_gb: Option<f32>,
|
||||
/// Available memory in GB
|
||||
available_gb: Option<f32>,
|
||||
/// Total swap in GB
|
||||
swap_total_gb: Option<f32>,
|
||||
/// Used swap in GB
|
||||
swap_used_gb: Option<f32>,
|
||||
/// /tmp directory size in MB
|
||||
tmp_size_mb: Option<f32>,
|
||||
/// /tmp total size in MB
|
||||
tmp_total_mb: Option<f32>,
|
||||
/// /tmp usage percentage
|
||||
tmp_usage_percent: Option<f32>,
|
||||
/// Aggregated status
|
||||
status: Status,
|
||||
/// Last update indicator
|
||||
has_data: bool,
|
||||
}
|
||||
|
||||
impl MemoryWidget {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
usage_percent: None,
|
||||
total_gb: None,
|
||||
used_gb: None,
|
||||
available_gb: None,
|
||||
swap_total_gb: None,
|
||||
swap_used_gb: None,
|
||||
tmp_size_mb: None,
|
||||
tmp_total_mb: None,
|
||||
tmp_usage_percent: None,
|
||||
status: Status::Unknown,
|
||||
has_data: false,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Get memory usage percentage for gauge
|
||||
fn get_memory_percentage(&self) -> u16 {
|
||||
match self.usage_percent {
|
||||
Some(percent) => percent.min(100.0).max(0.0) as u16,
|
||||
None => {
|
||||
// Calculate from used/total if percentage not available
|
||||
match (self.used_gb, self.total_gb) {
|
||||
(Some(used), Some(total)) if total > 0.0 => {
|
||||
let percent = (used / total * 100.0).min(100.0).max(0.0);
|
||||
percent as u16
|
||||
}
|
||||
_ => 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Format size with proper units (kB/MB/GB)
|
||||
fn format_size_units(size_mb: f32) -> String {
|
||||
if size_mb >= 1024.0 {
|
||||
// Convert to GB
|
||||
let size_gb = size_mb / 1024.0;
|
||||
format!("{:.1}GB", size_gb)
|
||||
} else if size_mb >= 1.0 {
|
||||
// Show as MB
|
||||
format!("{:.0}MB", size_mb)
|
||||
} else if size_mb >= 0.001 {
|
||||
// Convert to kB
|
||||
let size_kb = size_mb * 1024.0;
|
||||
format!("{:.0}kB", size_kb)
|
||||
} else {
|
||||
// Show very small sizes in bytes
|
||||
let size_bytes = size_mb * 1024.0 * 1024.0;
|
||||
format!("{:.0}B", size_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
/// Format /tmp usage as "xx% yyykB/MB/GB/zzzGB"
|
||||
fn format_tmp_usage(&self) -> String {
|
||||
match (self.tmp_usage_percent, self.tmp_size_mb, self.tmp_total_mb) {
|
||||
(Some(percent), Some(used_mb), Some(total_mb)) => {
|
||||
let used_str = Self::format_size_units(used_mb);
|
||||
let total_str = Self::format_size_units(total_mb);
|
||||
format!("{:.1}% {}/{}", percent, used_str, total_str)
|
||||
}
|
||||
(Some(percent), Some(used_mb), None) => {
|
||||
let used_str = Self::format_size_units(used_mb);
|
||||
format!("{:.1}% {}", percent, used_str)
|
||||
}
|
||||
(None, Some(used_mb), Some(total_mb)) => {
|
||||
let used_str = Self::format_size_units(used_mb);
|
||||
let total_str = Self::format_size_units(total_mb);
|
||||
format!("{}/{}", used_str, total_str)
|
||||
}
|
||||
(None, Some(used_mb), None) => {
|
||||
Self::format_size_units(used_mb)
|
||||
}
|
||||
_ => "—".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Get tmp status based on usage percentage
|
||||
fn get_tmp_status(&self) -> Status {
|
||||
if let Some(tmp_percent) = self.tmp_usage_percent {
|
||||
if tmp_percent >= 90.0 {
|
||||
Status::Critical
|
||||
} else if tmp_percent >= 70.0 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
} else {
|
||||
Status::Unknown
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl Widget for MemoryWidget {
|
||||
fn update_from_metrics(&mut self, metrics: &[&Metric]) {
|
||||
debug!("Memory widget updating with {} metrics", metrics.len());
|
||||
|
||||
// Reset status aggregation
|
||||
let mut statuses = Vec::new();
|
||||
|
||||
for metric in metrics {
|
||||
match metric.name.as_str() {
|
||||
"memory_usage_percent" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.usage_percent = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
"memory_total_gb" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.total_gb = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
"memory_used_gb" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.used_gb = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
"memory_available_gb" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.available_gb = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
"memory_swap_total_gb" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.swap_total_gb = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
"memory_swap_used_gb" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.swap_used_gb = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
"disk_tmp_size_mb" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.tmp_size_mb = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
"disk_tmp_total_mb" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.tmp_total_mb = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
"disk_tmp_usage_percent" => {
|
||||
if let Some(value) = metric.value.as_f32() {
|
||||
self.tmp_usage_percent = Some(value);
|
||||
statuses.push(metric.status);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// Aggregate status
|
||||
self.status = if statuses.is_empty() {
|
||||
Status::Unknown
|
||||
} else {
|
||||
Status::aggregate(&statuses)
|
||||
};
|
||||
|
||||
self.has_data = !metrics.is_empty();
|
||||
|
||||
debug!("Memory widget updated: usage={:?}%, total={:?}GB, swap_total={:?}GB, tmp={:?}/{:?}MB, status={:?}",
|
||||
self.usage_percent, self.total_gb, self.swap_total_gb, self.tmp_size_mb, self.tmp_total_mb, self.status);
|
||||
}
|
||||
|
||||
fn render(&mut self, frame: &mut Frame, area: Rect) {
|
||||
let content_chunks = Layout::default().direction(Direction::Vertical).constraints([Constraint::Length(1), Constraint::Length(1), Constraint::Length(1)]).split(area);
|
||||
let mem_title = Paragraph::new("RAM:").style(Typography::widget_title());
|
||||
frame.render_widget(mem_title, content_chunks[0]);
|
||||
|
||||
// Format used and total memory with smart units, percentage, and status icon
|
||||
let used_str = self.used_gb.map_or("—".to_string(), |v| Self::format_size_units(v * 1024.0)); // Convert GB to MB for formatting
|
||||
let total_str = self.total_gb.map_or("—".to_string(), |v| Self::format_size_units(v * 1024.0)); // Convert GB to MB for formatting
|
||||
let percentage = self.get_memory_percentage();
|
||||
let mem_details_spans = StatusIcons::create_status_spans(self.status, &format!("Used: {}% {}/{}", percentage, used_str, total_str));
|
||||
let mem_details_para = Paragraph::new(ratatui::text::Line::from(mem_details_spans));
|
||||
frame.render_widget(mem_details_para, content_chunks[1]);
|
||||
|
||||
// /tmp usage line with status icon
|
||||
let tmp_status = self.get_tmp_status();
|
||||
let tmp_spans = StatusIcons::create_status_spans(tmp_status, &format!("tmp: {}", self.format_tmp_usage()));
|
||||
let tmp_para = Paragraph::new(ratatui::text::Line::from(tmp_spans));
|
||||
frame.render_widget(tmp_para, content_chunks[2]);
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for MemoryWidget {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
@@ -1,21 +1,13 @@
|
||||
use cm_dashboard_shared::Metric;
|
||||
use ratatui::{layout::Rect, Frame};
|
||||
use cm_dashboard_shared::AgentData;
|
||||
|
||||
pub mod cpu;
|
||||
pub mod memory;
|
||||
pub mod services;
|
||||
pub mod backup;
|
||||
pub mod system;
|
||||
|
||||
pub use cpu::CpuWidget;
|
||||
pub use memory::MemoryWidget;
|
||||
pub use services::ServicesWidget;
|
||||
pub use backup::BackupWidget;
|
||||
pub use system::SystemWidget;
|
||||
|
||||
/// Widget trait for UI components that display metrics
|
||||
/// Widget trait for UI components that display structured data
|
||||
pub trait Widget {
|
||||
/// Update widget with new metrics data
|
||||
fn update_from_metrics(&mut self, metrics: &[&Metric]);
|
||||
|
||||
/// Render the widget to a terminal frame
|
||||
fn render(&mut self, frame: &mut Frame, area: Rect);
|
||||
}
|
||||
/// Update widget with structured agent data
|
||||
fn update_from_agent_data(&mut self, agent_data: &AgentData);
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use cm_dashboard_shared::{Metric, Status};
|
||||
use super::Widget;
|
||||
use ratatui::{
|
||||
layout::{Constraint, Direction, Layout, Rect},
|
||||
widgets::Paragraph,
|
||||
@@ -7,8 +8,7 @@ use ratatui::{
|
||||
use std::collections::HashMap;
|
||||
use tracing::debug;
|
||||
|
||||
use super::Widget;
|
||||
use crate::ui::theme::{Theme, Typography, Components, StatusIcons};
|
||||
use crate::ui::theme::{Components, StatusIcons, Theme, Typography};
|
||||
use ratatui::style::Style;
|
||||
|
||||
/// Services widget displaying hierarchical systemd service statuses
|
||||
@@ -22,14 +22,15 @@ pub struct ServicesWidget {
|
||||
status: Status,
|
||||
/// Last update indicator
|
||||
has_data: bool,
|
||||
/// Currently selected service index (for navigation cursor)
|
||||
selected_index: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct ServiceInfo {
|
||||
status: String,
|
||||
memory_mb: Option<f32>,
|
||||
disk_gb: Option<f32>,
|
||||
latency_ms: Option<f32>,
|
||||
metrics: Vec<(String, f32, Option<String>)>, // (label, value, unit)
|
||||
widget_status: Status,
|
||||
}
|
||||
|
||||
@@ -40,18 +41,22 @@ impl ServicesWidget {
|
||||
sub_services: HashMap::new(),
|
||||
status: Status::Unknown,
|
||||
has_data: false,
|
||||
selected_index: 0,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Extract service name and determine if it's a parent or sub-service
|
||||
#[allow(dead_code)]
|
||||
fn extract_service_info(metric_name: &str) -> Option<(String, Option<String>)> {
|
||||
if metric_name.starts_with("service_") {
|
||||
if let Some(end_pos) = metric_name.rfind("_status")
|
||||
if let Some(end_pos) = metric_name
|
||||
.rfind("_status")
|
||||
.or_else(|| metric_name.rfind("_memory_mb"))
|
||||
.or_else(|| metric_name.rfind("_disk_gb"))
|
||||
.or_else(|| metric_name.rfind("_latency_ms")) {
|
||||
.or_else(|| metric_name.rfind("_latency_ms"))
|
||||
{
|
||||
let service_part = &metric_name[8..end_pos]; // Remove "service_" prefix
|
||||
|
||||
|
||||
// Check for sub-services patterns
|
||||
if service_part.starts_with("nginx_") {
|
||||
// nginx sub-services: service_nginx_gitea_latency_ms -> ("nginx", "gitea")
|
||||
@@ -69,11 +74,11 @@ impl ServicesWidget {
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
|
||||
/// Format disk size with appropriate units (kB/MB/GB)
|
||||
fn format_disk_size(size_gb: f32) -> String {
|
||||
let size_mb = size_gb * 1024.0; // Convert GB to MB
|
||||
|
||||
|
||||
if size_mb >= 1024.0 {
|
||||
// Show as GB
|
||||
format!("{:.1}GB", size_gb)
|
||||
@@ -93,113 +98,253 @@ impl ServicesWidget {
|
||||
|
||||
/// Format parent service line - returns text without icon for span formatting
|
||||
fn format_parent_service_line(&self, name: &str, info: &ServiceInfo) -> String {
|
||||
let memory_str = info.memory_mb.map_or("0M".to_string(), |m| format!("{:.0}M", m));
|
||||
let disk_str = info.disk_gb.map_or("0".to_string(), |d| Self::format_disk_size(d));
|
||||
|
||||
let memory_str = info
|
||||
.memory_mb
|
||||
.map_or("0M".to_string(), |m| format!("{:.0}M", m));
|
||||
let disk_str = info
|
||||
.disk_gb
|
||||
.map_or("0".to_string(), |d| Self::format_disk_size(d));
|
||||
|
||||
// Truncate long service names to fit layout (account for icon space)
|
||||
let short_name = if name.len() > 22 {
|
||||
format!("{}...", &name[..19])
|
||||
} else {
|
||||
name.to_string()
|
||||
};
|
||||
|
||||
// Parent services always show active/inactive status
|
||||
|
||||
// Convert Status enum to display text
|
||||
let status_str = match info.widget_status {
|
||||
Status::Ok => "active".to_string(),
|
||||
Status::Warning => "inactive".to_string(),
|
||||
Status::Critical => "failed".to_string(),
|
||||
Status::Unknown => "unknown".to_string(),
|
||||
Status::Ok => "active",
|
||||
Status::Inactive => "inactive",
|
||||
Status::Critical => "failed",
|
||||
Status::Pending => "pending",
|
||||
Status::Warning => "warning",
|
||||
Status::Unknown => "unknown",
|
||||
Status::Offline => "offline",
|
||||
};
|
||||
|
||||
format!("{:<24} {:<10} {:<8} {:<8}",
|
||||
short_name,
|
||||
status_str,
|
||||
memory_str,
|
||||
disk_str)
|
||||
|
||||
format!(
|
||||
"{:<23} {:<10} {:<8} {:<8}",
|
||||
short_name, status_str, memory_str, disk_str
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/// Create spans for sub-service with icon next to name
|
||||
fn create_sub_service_spans(&self, name: &str, info: &ServiceInfo) -> Vec<ratatui::text::Span<'static>> {
|
||||
fn create_sub_service_spans(
|
||||
&self,
|
||||
name: &str,
|
||||
info: &ServiceInfo,
|
||||
is_last: bool,
|
||||
) -> Vec<ratatui::text::Span<'static>> {
|
||||
// Truncate long sub-service names to fit layout (accounting for indentation)
|
||||
let short_name = if name.len() > 18 {
|
||||
format!("{}...", &name[..15])
|
||||
} else {
|
||||
name.to_string()
|
||||
};
|
||||
|
||||
// Sub-services show latency if available, otherwise status
|
||||
let status_str = if let Some(latency) = info.latency_ms {
|
||||
if latency < 0.0 {
|
||||
"timeout".to_string()
|
||||
} else {
|
||||
format!("{:.0}ms", latency)
|
||||
}
|
||||
} else {
|
||||
match info.widget_status {
|
||||
Status::Ok => "active".to_string(),
|
||||
Status::Warning => "inactive".to_string(),
|
||||
Status::Critical => "failed".to_string(),
|
||||
Status::Unknown => "unknown".to_string(),
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Get status icon and text
|
||||
let icon = StatusIcons::get_icon(info.widget_status);
|
||||
let status_color = match info.widget_status {
|
||||
Status::Ok => Theme::success(),
|
||||
Status::Inactive => Theme::muted_text(),
|
||||
Status::Pending => Theme::highlight(),
|
||||
Status::Warning => Theme::warning(),
|
||||
Status::Critical => Theme::error(),
|
||||
Status::Unknown => Theme::muted_text(),
|
||||
Status::Offline => Theme::muted_text(),
|
||||
};
|
||||
|
||||
let icon = StatusIcons::get_icon(info.widget_status);
|
||||
|
||||
|
||||
// Display metrics or status for sub-services
|
||||
let status_str = if !info.metrics.is_empty() {
|
||||
// Show first metric with label and unit
|
||||
let (label, value, unit) = &info.metrics[0];
|
||||
match unit {
|
||||
Some(u) => format!("{}: {:.1} {}", label, value, u),
|
||||
None => format!("{}: {:.1}", label, value),
|
||||
}
|
||||
} else {
|
||||
// Convert Status enum to display text for sub-services
|
||||
match info.widget_status {
|
||||
Status::Ok => "active",
|
||||
Status::Inactive => "inactive",
|
||||
Status::Critical => "failed",
|
||||
Status::Pending => "pending",
|
||||
Status::Warning => "warning",
|
||||
Status::Unknown => "unknown",
|
||||
Status::Offline => "offline",
|
||||
}.to_string()
|
||||
};
|
||||
let tree_symbol = if is_last { "└─" } else { "├─" };
|
||||
|
||||
vec![
|
||||
// Indentation and tree prefix
|
||||
ratatui::text::Span::styled(
|
||||
" ├─ ".to_string(),
|
||||
Style::default().fg(Theme::secondary_text()).bg(Theme::background())
|
||||
format!(" {} ", tree_symbol),
|
||||
Typography::tree(),
|
||||
),
|
||||
// Status icon
|
||||
ratatui::text::Span::styled(
|
||||
format!("{} ", icon),
|
||||
Style::default().fg(status_color).bg(Theme::background())
|
||||
Style::default().fg(status_color).bg(Theme::background()),
|
||||
),
|
||||
// Service name
|
||||
ratatui::text::Span::styled(
|
||||
format!("{:<18} ", short_name),
|
||||
Style::default().fg(Theme::secondary_text()).bg(Theme::background())
|
||||
Style::default()
|
||||
.fg(Theme::secondary_text())
|
||||
.bg(Theme::background()),
|
||||
),
|
||||
// Status/latency text
|
||||
ratatui::text::Span::styled(
|
||||
status_str,
|
||||
Style::default().fg(Theme::secondary_text()).bg(Theme::background())
|
||||
Style::default()
|
||||
.fg(Theme::secondary_text())
|
||||
.bg(Theme::background()),
|
||||
),
|
||||
]
|
||||
}
|
||||
|
||||
/// Move selection up
|
||||
pub fn select_previous(&mut self) {
|
||||
if self.selected_index > 0 {
|
||||
self.selected_index -= 1;
|
||||
}
|
||||
debug!("Service selection moved up to: {}", self.selected_index);
|
||||
}
|
||||
|
||||
/// Move selection down
|
||||
pub fn select_next(&mut self, total_services: usize) {
|
||||
if total_services > 0 && self.selected_index < total_services.saturating_sub(1) {
|
||||
self.selected_index += 1;
|
||||
}
|
||||
debug!("Service selection: {}/{}", self.selected_index, total_services);
|
||||
}
|
||||
|
||||
/// Get currently selected service name (for actions)
|
||||
/// Only returns parent service names since only parent services can be selected
|
||||
pub fn get_selected_service(&self) -> Option<String> {
|
||||
// Only parent services can be selected, so just get the parent service at selected_index
|
||||
let mut parent_services: Vec<_> = self.parent_services.iter().collect();
|
||||
parent_services.sort_by(|(a, _), (b, _)| a.cmp(b));
|
||||
|
||||
parent_services.get(self.selected_index).map(|(name, _)| name.to_string())
|
||||
}
|
||||
|
||||
/// Get total count of selectable services (parent services only, not sub-services).
///
/// Used to bound the navigation cursor; sub-services are display-only
/// and never counted.
pub fn get_total_services_count(&self) -> usize {
    // Only count parent services - sub-services are not selectable
    self.parent_services.len()
}
|
||||
|
||||
|
||||
/// Calculate which parent service index corresponds to a display line index
|
||||
fn calculate_parent_service_index(&self, display_line_index: &usize) -> usize {
|
||||
// Build the same display list to map line index to parent service index
|
||||
let mut parent_index = 0;
|
||||
let mut line_index = 0;
|
||||
|
||||
let mut parent_services: Vec<_> = self.parent_services.iter().collect();
|
||||
parent_services.sort_by(|(a, _), (b, _)| a.cmp(b));
|
||||
|
||||
for (parent_name, _) in parent_services {
|
||||
if line_index == *display_line_index {
|
||||
return parent_index;
|
||||
}
|
||||
line_index += 1; // Parent service line
|
||||
|
||||
// Skip sub-services but count them in line_index
|
||||
if let Some(sub_list) = self.sub_services.get(parent_name) {
|
||||
line_index += sub_list.len();
|
||||
}
|
||||
|
||||
parent_index += 1;
|
||||
}
|
||||
|
||||
// If we get here, the display_line_index was probably for a sub-service
|
||||
// Return the last valid parent index (should not happen with our logic)
|
||||
parent_index.saturating_sub(1)
|
||||
}
|
||||
}
|
||||
|
||||
impl Widget for ServicesWidget {
    /// Rebuild the widget's service maps from a structured agent payload.
    ///
    /// Clears any previously stored parent/sub-service data, repopulates
    /// both maps from `agent_data.services`, then recomputes the widget's
    /// aggregate status from every stored status.
    fn update_from_agent_data(&mut self, agent_data: &cm_dashboard_shared::AgentData) {
        self.has_data = true;
        // Full rebuild: stale services from a previous payload must not linger.
        self.parent_services.clear();
        self.sub_services.clear();

        for service in &agent_data.services {
            // Store parent service
            let parent_info = ServiceInfo {
                memory_mb: Some(service.memory_mb),
                disk_gb: Some(service.disk_gb),
                metrics: Vec::new(), // Parent services don't have custom metrics
                widget_status: service.service_status,
            };
            self.parent_services.insert(service.name.clone(), parent_info);

            // Process sub-services if any; parents without subs get no map entry.
            if !service.sub_services.is_empty() {
                let mut sub_list = Vec::new();
                for sub_service in &service.sub_services {
                    // Convert metrics to display format: (label, value, unit)
                    let metrics: Vec<(String, f32, Option<String>)> = sub_service.metrics.iter()
                        .map(|m| (m.label.clone(), m.value, m.unit.clone()))
                        .collect();

                    let sub_info = ServiceInfo {
                        memory_mb: None, // Not used for sub-services
                        disk_gb: None,   // Not used for sub-services
                        metrics,
                        widget_status: sub_service.service_status,
                    };
                    sub_list.push((sub_service.name.clone(), sub_info));
                }
                self.sub_services.insert(service.name.clone(), sub_list);
            }
        }

        // Aggregate status from all services (parents and sub-services alike).
        let mut all_statuses = Vec::new();
        all_statuses.extend(self.parent_services.values().map(|info| info.widget_status));
        for sub_list in self.sub_services.values() {
            all_statuses.extend(sub_list.iter().map(|(_, info)| info.widget_status));
        }

        // Empty payload → Unknown rather than a misleading Ok.
        self.status = if all_statuses.is_empty() {
            Status::Unknown
        } else {
            Status::aggregate(&all_statuses)
        };
    }
}
|
||||
|
||||
impl ServicesWidget {
|
||||
#[allow(dead_code)]
|
||||
fn update_from_metrics(&mut self, metrics: &[&Metric]) {
|
||||
debug!("Services widget updating with {} metrics", metrics.len());
|
||||
|
||||
|
||||
// Don't clear existing services - preserve data between metric batches
|
||||
|
||||
|
||||
// Process individual service metrics
|
||||
for metric in metrics {
|
||||
if let Some((parent_service, sub_service)) = Self::extract_service_info(&metric.name) {
|
||||
match sub_service {
|
||||
None => {
|
||||
// Parent service metric
|
||||
let service_info = self.parent_services.entry(parent_service).or_insert(ServiceInfo {
|
||||
status: "unknown".to_string(),
|
||||
memory_mb: None,
|
||||
disk_gb: None,
|
||||
latency_ms: None,
|
||||
widget_status: Status::Unknown,
|
||||
});
|
||||
|
||||
let service_info =
|
||||
self.parent_services
|
||||
.entry(parent_service)
|
||||
.or_insert(ServiceInfo {
|
||||
memory_mb: None,
|
||||
disk_gb: None,
|
||||
metrics: Vec::new(),
|
||||
widget_status: Status::Unknown,
|
||||
});
|
||||
|
||||
if metric.name.ends_with("_status") {
|
||||
service_info.status = metric.value.as_string();
|
||||
service_info.widget_status = metric.status;
|
||||
} else if metric.name.ends_with("_memory_mb") {
|
||||
if let Some(memory) = metric.value.as_f32() {
|
||||
@@ -213,24 +358,31 @@ impl Widget for ServicesWidget {
|
||||
}
|
||||
Some(sub_name) => {
|
||||
// Sub-service metric
|
||||
let sub_service_list = self.sub_services.entry(parent_service).or_insert_with(Vec::new);
|
||||
|
||||
let sub_service_list = self
|
||||
.sub_services
|
||||
.entry(parent_service)
|
||||
.or_insert_with(Vec::new);
|
||||
|
||||
// Find existing sub-service or create new one
|
||||
let sub_service_info = if let Some(pos) = sub_service_list.iter().position(|(name, _)| name == &sub_name) {
|
||||
let sub_service_info = if let Some(pos) = sub_service_list
|
||||
.iter()
|
||||
.position(|(name, _)| name == &sub_name)
|
||||
{
|
||||
&mut sub_service_list[pos].1
|
||||
} else {
|
||||
sub_service_list.push((sub_name.clone(), ServiceInfo {
|
||||
status: "unknown".to_string(),
|
||||
memory_mb: None,
|
||||
disk_gb: None,
|
||||
latency_ms: None,
|
||||
widget_status: Status::Unknown,
|
||||
}));
|
||||
sub_service_list.push((
|
||||
sub_name.clone(),
|
||||
ServiceInfo {
|
||||
memory_mb: None,
|
||||
disk_gb: None,
|
||||
metrics: Vec::new(),
|
||||
widget_status: Status::Unknown,
|
||||
},
|
||||
));
|
||||
&mut sub_service_list.last_mut().unwrap().1
|
||||
};
|
||||
|
||||
|
||||
if metric.name.ends_with("_status") {
|
||||
sub_service_info.status = metric.value.as_string();
|
||||
sub_service_info.widget_status = metric.status;
|
||||
} else if metric.name.ends_with("_memory_mb") {
|
||||
if let Some(memory) = metric.value.as_f32() {
|
||||
@@ -240,124 +392,192 @@ impl Widget for ServicesWidget {
|
||||
if let Some(disk) = metric.value.as_f32() {
|
||||
sub_service_info.disk_gb = Some(disk);
|
||||
}
|
||||
} else if metric.name.ends_with("_latency_ms") {
|
||||
if let Some(latency) = metric.value.as_f32() {
|
||||
sub_service_info.latency_ms = Some(latency);
|
||||
sub_service_info.widget_status = metric.status;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Aggregate status from all parent and sub-services
|
||||
let mut all_statuses = Vec::new();
|
||||
|
||||
|
||||
// Add parent service statuses
|
||||
all_statuses.extend(self.parent_services.values().map(|info| info.widget_status));
|
||||
|
||||
|
||||
// Add sub-service statuses
|
||||
for sub_list in self.sub_services.values() {
|
||||
all_statuses.extend(sub_list.iter().map(|(_, info)| info.widget_status));
|
||||
}
|
||||
|
||||
|
||||
self.status = if all_statuses.is_empty() {
|
||||
Status::Unknown
|
||||
} else {
|
||||
Status::aggregate(&all_statuses)
|
||||
};
|
||||
|
||||
|
||||
self.has_data = !self.parent_services.is_empty() || !self.sub_services.is_empty();
|
||||
|
||||
debug!("Services widget updated: {} parent services, {} sub-service groups, status={:?}",
|
||||
self.parent_services.len(), self.sub_services.len(), self.status);
|
||||
|
||||
// Ensure selection index is within bounds after update
|
||||
let total_count = self.get_total_services_count();
|
||||
if self.selected_index >= total_count && total_count > 0 {
|
||||
self.selected_index = total_count - 1;
|
||||
}
|
||||
|
||||
debug!(
|
||||
"Services widget updated: {} parent services, {} sub-service groups, total={}, selected={}, status={:?}",
|
||||
self.parent_services.len(),
|
||||
self.sub_services.len(),
|
||||
total_count,
|
||||
self.selected_index,
|
||||
self.status
|
||||
);
|
||||
}
|
||||
|
||||
fn render(&mut self, frame: &mut Frame, area: Rect) {
|
||||
|
||||
}
|
||||
|
||||
impl ServicesWidget {
|
||||
|
||||
/// Render with focus
|
||||
pub fn render(&mut self, frame: &mut Frame, area: Rect, is_focused: bool) {
|
||||
let services_block = Components::widget_block("services");
|
||||
let inner_area = services_block.inner(area);
|
||||
frame.render_widget(services_block, area);
|
||||
|
||||
|
||||
let content_chunks = Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([Constraint::Length(1), Constraint::Min(0)])
|
||||
.split(inner_area);
|
||||
|
||||
|
||||
// Header
|
||||
let header = format!("{:<25} {:<10} {:<8} {:<8}", "Service:", "Status:", "RAM:", "Disk:");
|
||||
let header = format!(
|
||||
"{:<25} {:<10} {:<8} {:<8}",
|
||||
"Service:", "Status:", "RAM:", "Disk:"
|
||||
);
|
||||
let header_para = Paragraph::new(header).style(Typography::muted());
|
||||
frame.render_widget(header_para, content_chunks[0]);
|
||||
|
||||
|
||||
// Check if we have any services to display
|
||||
if self.parent_services.is_empty() && self.sub_services.is_empty() {
|
||||
let empty_text = Paragraph::new("No process data").style(Typography::muted());
|
||||
frame.render_widget(empty_text, content_chunks[1]);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Render the services list
|
||||
self.render_services(frame, content_chunks[1], is_focused);
|
||||
}
|
||||
|
||||
/// Render services list
|
||||
fn render_services(&mut self, frame: &mut Frame, area: Rect, is_focused: bool) {
|
||||
// Build hierarchical service list for display
|
||||
let mut display_lines = Vec::new();
|
||||
|
||||
let mut display_lines: Vec<(String, Status, bool, Option<(ServiceInfo, bool)>)> = Vec::new();
|
||||
|
||||
// Sort parent services alphabetically for consistent order
|
||||
let mut parent_services: Vec<_> = self.parent_services.iter().collect();
|
||||
parent_services.sort_by(|(a, _), (b, _)| a.cmp(b));
|
||||
|
||||
|
||||
for (parent_name, parent_info) in parent_services {
|
||||
// Add parent service line
|
||||
let parent_line = self.format_parent_service_line(parent_name, parent_info);
|
||||
display_lines.push((parent_line, parent_info.widget_status, false, None)); // false = not sub-service
|
||||
|
||||
display_lines.push((parent_line, parent_info.widget_status, false, None));
|
||||
|
||||
// Add sub-services for this parent (if any)
|
||||
if let Some(sub_list) = self.sub_services.get(parent_name) {
|
||||
// Sort sub-services by name for consistent display
|
||||
let mut sorted_subs = sub_list.clone();
|
||||
sorted_subs.sort_by(|(a, _), (b, _)| a.cmp(b));
|
||||
|
||||
for (sub_name, sub_info) in sorted_subs {
|
||||
|
||||
for (i, (sub_name, sub_info)) in sorted_subs.iter().enumerate() {
|
||||
let is_last_sub = i == sorted_subs.len() - 1;
|
||||
// Store sub-service info for custom span rendering
|
||||
display_lines.push((sub_name.clone(), sub_info.widget_status, true, Some(sub_info.clone()))); // true = sub-service
|
||||
display_lines.push((
|
||||
sub_name.clone(),
|
||||
sub_info.widget_status,
|
||||
true,
|
||||
Some((sub_info.clone(), is_last_sub)),
|
||||
)); // true = sub-service, with is_last info
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Show only what fits, with "X more below" if needed
|
||||
let available_lines = area.height as usize;
|
||||
let total_lines = display_lines.len();
|
||||
|
||||
// Render all lines within available space
|
||||
let available_lines = content_chunks[1].height as usize;
|
||||
let lines_to_show = available_lines.min(display_lines.len());
|
||||
// Reserve one line for "X more below" if needed
|
||||
let lines_for_content = if total_lines > available_lines {
|
||||
available_lines.saturating_sub(1)
|
||||
} else {
|
||||
available_lines
|
||||
};
|
||||
|
||||
let visible_lines: Vec<_> = display_lines
|
||||
.iter()
|
||||
.take(lines_for_content)
|
||||
.collect();
|
||||
|
||||
let hidden_below = total_lines.saturating_sub(lines_for_content);
|
||||
|
||||
let lines_to_show = visible_lines.len();
|
||||
|
||||
if lines_to_show > 0 {
|
||||
// Add space for "X more below" message if needed
|
||||
let total_chunks_needed = if hidden_below > 0 { lines_to_show + 1 } else { lines_to_show };
|
||||
let service_chunks = Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints(vec![Constraint::Length(1); lines_to_show])
|
||||
.split(content_chunks[1]);
|
||||
|
||||
for (i, (line_text, line_status, is_sub, sub_info)) in display_lines.iter().take(lines_to_show).enumerate() {
|
||||
let spans = if *is_sub && sub_info.is_some() {
|
||||
// Use custom sub-service span creation
|
||||
self.create_sub_service_spans(line_text, sub_info.as_ref().unwrap())
|
||||
.constraints(vec![Constraint::Length(1); total_chunks_needed])
|
||||
.split(area);
|
||||
|
||||
for (i, (line_text, line_status, is_sub, sub_info)) in visible_lines.iter().enumerate()
|
||||
{
|
||||
let actual_index = i; // Simple index since we're not scrolling
|
||||
|
||||
// Only parent services can be selected - calculate parent service index
|
||||
let is_selected = if !*is_sub {
|
||||
// This is a parent service - count how many parent services came before this one
|
||||
let parent_index = self.calculate_parent_service_index(&actual_index);
|
||||
parent_index == self.selected_index
|
||||
} else {
|
||||
// Use regular status spans for parent services
|
||||
StatusIcons::create_status_spans(*line_status, line_text)
|
||||
};
|
||||
let service_para = Paragraph::new(ratatui::text::Line::from(spans));
|
||||
frame.render_widget(service_para, service_chunks[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Show indicator if there are more services than we can display
|
||||
if display_lines.len() > available_lines {
|
||||
let more_count = display_lines.len() - available_lines;
|
||||
if available_lines > 0 {
|
||||
let last_line_area = Rect {
|
||||
x: content_chunks[1].x,
|
||||
y: content_chunks[1].y + (available_lines - 1) as u16,
|
||||
width: content_chunks[1].width,
|
||||
height: 1,
|
||||
false // Sub-services are never selected
|
||||
};
|
||||
|
||||
let more_text = format!("... and {} more services", more_count);
|
||||
let mut spans = if *is_sub && sub_info.is_some() {
|
||||
// Use custom sub-service span creation
|
||||
let (service_info, is_last) = sub_info.as_ref().unwrap();
|
||||
self.create_sub_service_spans(line_text, service_info, *is_last)
|
||||
} else {
|
||||
// Parent services - use normal status spans
|
||||
StatusIcons::create_status_spans(*line_status, line_text)
|
||||
};
|
||||
|
||||
// Apply selection highlighting to parent services only
|
||||
// Only show selection when Services panel is focused
|
||||
if is_selected && !*is_sub && is_focused {
|
||||
for (i, span) in spans.iter_mut().enumerate() {
|
||||
if i == 0 {
|
||||
// First span is the status icon - use background color for visibility against blue selection
|
||||
span.style = span.style
|
||||
.bg(Theme::highlight())
|
||||
.fg(Theme::background());
|
||||
} else {
|
||||
// Other spans (text) get full selection highlighting
|
||||
span.style = span.style
|
||||
.bg(Theme::highlight())
|
||||
.fg(Theme::background());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let service_para = Paragraph::new(ratatui::text::Line::from(spans));
|
||||
|
||||
frame.render_widget(service_para, service_chunks[i]);
|
||||
}
|
||||
|
||||
// Show "X more below" message if content was truncated
|
||||
if hidden_below > 0 {
|
||||
let more_text = format!("... {} more below", hidden_below);
|
||||
let more_para = Paragraph::new(more_text).style(Typography::muted());
|
||||
frame.render_widget(more_para, last_line_area);
|
||||
frame.render_widget(more_para, service_chunks[lines_to_show]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -367,4 +587,4 @@ impl Default for ServicesWidget {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
910
dashboard/src/ui/widgets/system.rs
Normal file
910
dashboard/src/ui/widgets/system.rs
Normal file
@@ -0,0 +1,910 @@
|
||||
use cm_dashboard_shared::Status;
|
||||
use ratatui::{
|
||||
layout::Rect,
|
||||
text::{Line, Span, Text},
|
||||
widgets::Paragraph,
|
||||
Frame,
|
||||
};
|
||||
|
||||
use crate::ui::theme::{StatusIcons, Typography};
|
||||
|
||||
/// System widget displaying NixOS info, Network, CPU, RAM, and Storage in unified layout
#[derive(Clone)]
pub struct SystemWidget {
    // NixOS information
    // Populated from AgentData::build_version on each update.
    nixos_build: Option<String>,
    // Populated from AgentData::agent_version (a version string, despite the "hash" name).
    agent_hash: Option<String>,

    // Network interfaces
    network_interfaces: Vec<cm_dashboard_shared::NetworkInterfaceData>,

    // CPU metrics
    cpu_load_1min: Option<f32>,
    cpu_load_5min: Option<f32>,
    cpu_load_15min: Option<f32>,
    // MHz (rendered as "{:.0} MHz").
    cpu_frequency: Option<f32>,
    cpu_status: Status,

    // Memory metrics
    memory_usage_percent: Option<f32>,
    memory_used_gb: Option<f32>,
    memory_total_gb: Option<f32>,
    // Legacy single-mount /tmp values, kept for backward compatibility;
    // the full list lives in tmpfs_mounts below.
    tmp_usage_percent: Option<f32>,
    tmp_used_gb: Option<f32>,
    tmp_total_gb: Option<f32>,
    memory_status: Status,
    tmp_status: Status,
    /// All tmpfs mounts (for auto-discovery support)
    tmpfs_mounts: Vec<cm_dashboard_shared::TmpfsData>,

    // Storage metrics (collected from disk metrics)
    storage_pools: Vec<StoragePool>,

    // Backup metrics
    // Raw status string from the agent; render_backup maps
    // "completed"/"success"/"running"/"failed" onto display statuses.
    backup_status: String,
    backup_start_time_raw: Option<String>,
    backup_disk_serial: Option<String>,
    backup_disk_usage_percent: Option<f32>,
    backup_disk_used_gb: Option<f32>,
    backup_disk_total_gb: Option<f32>,
    backup_disk_wear_percent: Option<f32>,
    backup_disk_temperature: Option<f32>,
    backup_last_size_gb: Option<f32>,

    // Overall status
    // True once at least one agent snapshot has been applied.
    has_data: bool,
}
|
||||
|
||||
/// One entry in the storage tree: either a physical drive (pool_type
/// "drive", built from per-drive agent data) or a multi-drive pool.
#[derive(Clone)]
struct StoragePool {
    // Pool or drive name; also used as the de-duplication key while collecting.
    name: String,
    // NOTE(review): for "drive" pools this is set to the drive name, not a
    // mount path — confirm intended.
    mount_point: String,
    pool_type: String, // "single", "mergerfs (2+1)", "RAID5 (3+1)", etc.
    drives: Vec<StorageDrive>, // For physical drives
    data_drives: Vec<StorageDrive>, // For MergerFS pools
    parity_drives: Vec<StorageDrive>, // For MergerFS pools
    filesystems: Vec<FileSystem>, // For physical drive pools: individual filesystem children
    usage_percent: Option<f32>,
    used_gb: Option<f32>,
    total_gb: Option<f32>,
    // Combined health/usage status (Critical > Warning > Ok, else Unknown).
    status: Status,
}
|
||||
|
||||
/// A physical drive as shown in the storage tree.
#[derive(Clone)]
struct StorageDrive {
    // Display name: truncated serial number when known, else device name.
    name: String,
    // Degrees Celsius; rendered as "T: {n}°C" when present.
    temperature: Option<f32>,
    // Rendered as "W: {n}%" when present.
    wear_percent: Option<f32>,
    // Combined health + temperature status from the agent.
    status: Status,
}
|
||||
|
||||
/// A mounted filesystem belonging to a physical-drive pool, rendered as a
/// tree child line ("{mount}: {pct}% {used}GB/{total}GB").
#[derive(Clone)]
struct FileSystem {
    mount_point: String,
    usage_percent: Option<f32>,
    used_gb: Option<f32>,
    total_gb: Option<f32>,
    status: Status,
}
|
||||
|
||||
impl SystemWidget {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
nixos_build: None,
|
||||
agent_hash: None,
|
||||
network_interfaces: Vec::new(),
|
||||
cpu_load_1min: None,
|
||||
cpu_load_5min: None,
|
||||
cpu_load_15min: None,
|
||||
cpu_frequency: None,
|
||||
cpu_status: Status::Unknown,
|
||||
memory_usage_percent: None,
|
||||
memory_used_gb: None,
|
||||
memory_total_gb: None,
|
||||
tmp_usage_percent: None,
|
||||
tmp_used_gb: None,
|
||||
tmp_total_gb: None,
|
||||
memory_status: Status::Unknown,
|
||||
tmp_status: Status::Unknown,
|
||||
tmpfs_mounts: Vec::new(),
|
||||
storage_pools: Vec::new(),
|
||||
backup_status: "unknown".to_string(),
|
||||
backup_start_time_raw: None,
|
||||
backup_disk_serial: None,
|
||||
backup_disk_usage_percent: None,
|
||||
backup_disk_used_gb: None,
|
||||
backup_disk_total_gb: None,
|
||||
backup_disk_wear_percent: None,
|
||||
backup_disk_temperature: None,
|
||||
backup_last_size_gb: None,
|
||||
has_data: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Format CPU load averages
|
||||
fn format_cpu_load(&self) -> String {
|
||||
match (self.cpu_load_1min, self.cpu_load_5min, self.cpu_load_15min) {
|
||||
(Some(l1), Some(l5), Some(l15)) => {
|
||||
format!("{:.2} {:.2} {:.2}", l1, l5, l15)
|
||||
}
|
||||
_ => "— — —".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Format CPU frequency
|
||||
fn format_cpu_frequency(&self) -> String {
|
||||
match self.cpu_frequency {
|
||||
Some(freq) => format!("{:.0} MHz", freq),
|
||||
None => "— MHz".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Format memory usage
|
||||
fn format_memory_usage(&self) -> String {
|
||||
match (self.memory_usage_percent, self.memory_used_gb, self.memory_total_gb) {
|
||||
(Some(pct), Some(used), Some(total)) => {
|
||||
format!("{:.0}% {:.1}GB/{:.1}GB", pct, used, total)
|
||||
}
|
||||
_ => "—% —GB/—GB".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Get the current agent hash for rebuild completion detection
|
||||
pub fn _get_agent_hash(&self) -> Option<&String> {
|
||||
self.agent_hash.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
use super::Widget;
|
||||
|
||||
impl Widget for SystemWidget {
    /// Ingest a fresh `AgentData` snapshot and refresh every cached field
    /// the widget renders from.
    fn update_from_agent_data(&mut self, agent_data: &cm_dashboard_shared::AgentData) {
        self.has_data = true;

        // Extract agent version
        self.agent_hash = Some(agent_data.agent_version.clone());

        // Extract build version
        self.nixos_build = agent_data.build_version.clone();

        // Extract network interfaces
        self.network_interfaces = agent_data.system.network.interfaces.clone();

        // Extract CPU data directly
        let cpu = &agent_data.system.cpu;
        self.cpu_load_1min = Some(cpu.load_1min);
        self.cpu_load_5min = Some(cpu.load_5min);
        self.cpu_load_15min = Some(cpu.load_15min);
        self.cpu_frequency = Some(cpu.frequency_mhz);
        // NOTE(review): the agent supplies cpu.load_status, but Ok is
        // hard-coded here — confirm whether the agent status should be used.
        self.cpu_status = Status::Ok;

        // Extract memory data directly
        let memory = &agent_data.system.memory;
        self.memory_usage_percent = Some(memory.usage_percent);
        self.memory_used_gb = Some(memory.used_gb);
        self.memory_total_gb = Some(memory.total_gb);
        // NOTE(review): memory.usage_status from the agent is likewise ignored.
        self.memory_status = Status::Ok;

        // Store all tmpfs mounts for display
        self.tmpfs_mounts = memory.tmpfs.clone();

        // Extract tmpfs data (maintain backward compatibility for /tmp)
        // If no /tmp tmpfs is reported, the previous tmp_* values are kept.
        if let Some(tmp_data) = memory.tmpfs.iter().find(|t| t.mount == "/tmp") {
            self.tmp_usage_percent = Some(tmp_data.usage_percent);
            self.tmp_used_gb = Some(tmp_data.used_gb);
            self.tmp_total_gb = Some(tmp_data.total_gb);
            self.tmp_status = Status::Ok;
        }

        // Convert storage data to internal format
        self.update_storage_from_agent_data(agent_data);

        // Extract backup data
        let backup = &agent_data.backup;
        self.backup_status = backup.status.clone();
        self.backup_start_time_raw = backup.start_time_raw.clone();
        self.backup_last_size_gb = backup.last_backup_size_gb;

        // Repository-disk details are explicitly cleared when the disk is
        // absent so stale values are never displayed after it disappears.
        if let Some(disk) = &backup.repository_disk {
            self.backup_disk_serial = Some(disk.serial.clone());
            self.backup_disk_usage_percent = Some(disk.usage_percent);
            self.backup_disk_used_gb = Some(disk.used_gb);
            self.backup_disk_total_gb = Some(disk.total_gb);
            self.backup_disk_wear_percent = disk.wear_percent;
            self.backup_disk_temperature = disk.temperature_celsius;
        } else {
            self.backup_disk_serial = None;
            self.backup_disk_usage_percent = None;
            self.backup_disk_used_gb = None;
            self.backup_disk_total_gb = None;
            self.backup_disk_wear_percent = None;
            self.backup_disk_temperature = None;
        }
    }
}
|
||||
|
||||
impl SystemWidget {
|
||||
/// Convert structured storage data to internal format
|
||||
fn update_storage_from_agent_data(&mut self, agent_data: &cm_dashboard_shared::AgentData) {
|
||||
let mut pools: std::collections::HashMap<String, StoragePool> = std::collections::HashMap::new();
|
||||
|
||||
// Convert drives
|
||||
for drive in &agent_data.system.storage.drives {
|
||||
let mut pool = StoragePool {
|
||||
name: drive.name.clone(),
|
||||
mount_point: drive.name.clone(),
|
||||
pool_type: "drive".to_string(),
|
||||
drives: Vec::new(),
|
||||
data_drives: Vec::new(),
|
||||
parity_drives: Vec::new(),
|
||||
filesystems: Vec::new(),
|
||||
usage_percent: None,
|
||||
used_gb: None,
|
||||
total_gb: None,
|
||||
status: Status::Ok,
|
||||
};
|
||||
|
||||
// Add drive info
|
||||
let display_name = drive.serial_number.as_ref()
|
||||
.map(|s| truncate_serial(s))
|
||||
.unwrap_or(drive.name.clone());
|
||||
let storage_drive = StorageDrive {
|
||||
name: display_name,
|
||||
temperature: drive.temperature_celsius,
|
||||
wear_percent: drive.wear_percent,
|
||||
status: Status::Ok,
|
||||
};
|
||||
pool.drives.push(storage_drive);
|
||||
|
||||
// Calculate totals from filesystems
|
||||
let total_used: f32 = drive.filesystems.iter().map(|fs| fs.used_gb).sum();
|
||||
let total_size: f32 = drive.filesystems.iter().map(|fs| fs.total_gb).sum();
|
||||
let average_usage = if total_size > 0.0 { (total_used / total_size) * 100.0 } else { 0.0 };
|
||||
|
||||
pool.usage_percent = Some(average_usage);
|
||||
pool.used_gb = Some(total_used);
|
||||
pool.total_gb = Some(total_size);
|
||||
|
||||
// Add filesystems
|
||||
for fs in &drive.filesystems {
|
||||
let filesystem = FileSystem {
|
||||
mount_point: fs.mount.clone(),
|
||||
usage_percent: Some(fs.usage_percent),
|
||||
used_gb: Some(fs.used_gb),
|
||||
total_gb: Some(fs.total_gb),
|
||||
status: Status::Ok,
|
||||
};
|
||||
pool.filesystems.push(filesystem);
|
||||
}
|
||||
|
||||
pools.insert(drive.name.clone(), pool);
|
||||
}
|
||||
|
||||
// Convert pools (MergerFS, RAID, etc.)
|
||||
for pool in &agent_data.system.storage.pools {
|
||||
// Use agent-calculated status (combined health and usage status)
|
||||
let pool_status = if pool.health_status == Status::Critical || pool.usage_status == Status::Critical {
|
||||
Status::Critical
|
||||
} else if pool.health_status == Status::Warning || pool.usage_status == Status::Warning {
|
||||
Status::Warning
|
||||
} else if pool.health_status == Status::Ok && pool.usage_status == Status::Ok {
|
||||
Status::Ok
|
||||
} else {
|
||||
Status::Unknown
|
||||
};
|
||||
|
||||
let mut storage_pool = StoragePool {
|
||||
name: pool.name.clone(),
|
||||
mount_point: pool.mount.clone(),
|
||||
pool_type: pool.pool_type.clone(),
|
||||
drives: Vec::new(),
|
||||
data_drives: Vec::new(),
|
||||
parity_drives: Vec::new(),
|
||||
filesystems: Vec::new(),
|
||||
usage_percent: Some(pool.usage_percent),
|
||||
used_gb: Some(pool.used_gb),
|
||||
total_gb: Some(pool.total_gb),
|
||||
status: pool_status,
|
||||
};
|
||||
|
||||
// Add data drives - use agent-calculated status
|
||||
for drive in &pool.data_drives {
|
||||
// Use combined health and temperature status
|
||||
let drive_status = if drive.health_status == Status::Critical || drive.temperature_status == Status::Critical {
|
||||
Status::Critical
|
||||
} else if drive.health_status == Status::Warning || drive.temperature_status == Status::Warning {
|
||||
Status::Warning
|
||||
} else if drive.health_status == Status::Ok && drive.temperature_status == Status::Ok {
|
||||
Status::Ok
|
||||
} else {
|
||||
Status::Unknown
|
||||
};
|
||||
|
||||
let display_name = drive.serial_number.as_ref()
|
||||
.map(|s| truncate_serial(s))
|
||||
.unwrap_or(drive.name.clone());
|
||||
let storage_drive = StorageDrive {
|
||||
name: display_name,
|
||||
temperature: drive.temperature_celsius,
|
||||
wear_percent: drive.wear_percent,
|
||||
status: drive_status,
|
||||
};
|
||||
storage_pool.data_drives.push(storage_drive);
|
||||
}
|
||||
|
||||
// Add parity drives - use agent-calculated status
|
||||
for drive in &pool.parity_drives {
|
||||
// Use combined health and temperature status
|
||||
let drive_status = if drive.health_status == Status::Critical || drive.temperature_status == Status::Critical {
|
||||
Status::Critical
|
||||
} else if drive.health_status == Status::Warning || drive.temperature_status == Status::Warning {
|
||||
Status::Warning
|
||||
} else if drive.health_status == Status::Ok && drive.temperature_status == Status::Ok {
|
||||
Status::Ok
|
||||
} else {
|
||||
Status::Unknown
|
||||
};
|
||||
|
||||
let display_name = drive.serial_number.as_ref()
|
||||
.map(|s| truncate_serial(s))
|
||||
.unwrap_or(drive.name.clone());
|
||||
let storage_drive = StorageDrive {
|
||||
name: display_name,
|
||||
temperature: drive.temperature_celsius,
|
||||
wear_percent: drive.wear_percent,
|
||||
status: drive_status,
|
||||
};
|
||||
storage_pool.parity_drives.push(storage_drive);
|
||||
}
|
||||
|
||||
pools.insert(pool.name.clone(), storage_pool);
|
||||
}
|
||||
|
||||
// Store pools
|
||||
let mut pool_list: Vec<StoragePool> = pools.into_values().collect();
|
||||
pool_list.sort_by(|a, b| a.name.cmp(&b.name));
|
||||
self.storage_pools = pool_list;
|
||||
}
|
||||
|
||||
/// Render storage section with enhanced tree structure
///
/// One header line per pool, then tree-drawn children: filesystem lines
/// for "drive" pools, or Total / Data_n / Parity lines for merged pools.
fn render_storage(&self) -> Vec<Line<'_>> {
    let mut lines = Vec::new();

    for pool in &self.storage_pools {
        // Pool header line with type and health
        let pool_label = if pool.pool_type == "drive" {
            // For physical drives, show the drive name with temperature and wear percentage if available
            // Physical drives only have one drive entry
            if let Some(drive) = pool.drives.first() {
                let mut drive_details = Vec::new();
                if let Some(temp) = drive.temperature {
                    drive_details.push(format!("T: {}°C", temp as i32));
                }
                if let Some(wear) = drive.wear_percent {
                    drive_details.push(format!("W: {}%", wear as i32));
                }

                if !drive_details.is_empty() {
                    format!("{} {}", drive.name, drive_details.join(" "))
                } else {
                    drive.name.clone()
                }
            } else {
                pool.name.clone()
            }
        } else {
            // For mergerfs pools, show pool type with mount point
            // NOTE(review): the label hard-codes "mergerfs" even though
            // pool_type may name another pool kind — confirm intended.
            format!("mergerfs {}:", pool.mount_point)
        };

        let pool_spans = StatusIcons::create_status_spans(pool.status.clone(), &pool_label);
        lines.push(Line::from(pool_spans));

        // Show individual filesystems for physical drives (matching CLAUDE.md format)
        if pool.pool_type == "drive" {
            // Show filesystem entries like: ├─ ● /: 55% 250.5GB/456.4GB
            for (i, filesystem) in pool.filesystems.iter().enumerate() {
                let is_last = i == pool.filesystems.len() - 1;
                let tree_symbol = if is_last { " └─ " } else { " ├─ " };

                let fs_text = format!("{}: {:.0}% {:.1}GB/{:.1}GB",
                    filesystem.mount_point,
                    filesystem.usage_percent.unwrap_or(0.0),
                    filesystem.used_gb.unwrap_or(0.0),
                    filesystem.total_gb.unwrap_or(0.0));

                let mut fs_spans = vec![
                    Span::styled(tree_symbol, Typography::tree()),
                ];
                fs_spans.extend(StatusIcons::create_status_spans(
                    filesystem.status.clone(),
                    &fs_text
                ));
                lines.push(Line::from(fs_spans));
            }
        } else {
            // For mergerfs pools, show structure matching CLAUDE.md format:
            // ● mergerfs (2+1):
            // ├─ Total: ● 63% 2355.2GB/3686.4GB
            // ├─ Data Disks:
            // │ ├─ ● sdb T: 24°C W: 5%
            // │ └─ ● sdd T: 27°C W: 5%
            // ├─ Parity: ● sdc T: 24°C W: 5%
            // └─ Mount: /srv/media

            // Pool total usage
            let total_text = format!("{:.0}% {:.1}GB/{:.1}GB",
                pool.usage_percent.unwrap_or(0.0),
                pool.used_gb.unwrap_or(0.0),
                pool.total_gb.unwrap_or(0.0)
            );
            let mut total_spans = vec![
                Span::styled(" ├─ ", Typography::tree()),
            ];
            total_spans.extend(StatusIcons::create_status_spans(Status::Ok, &total_text));
            lines.push(Line::from(total_spans));

            // Data drives - at same level as parity
            let has_parity = !pool.parity_drives.is_empty();
            for (i, drive) in pool.data_drives.iter().enumerate() {
                let is_last_data = i == pool.data_drives.len() - 1;
                let mut drive_details = Vec::new();
                if let Some(temp) = drive.temperature {
                    drive_details.push(format!("T: {}°C", temp as i32));
                }
                if let Some(wear) = drive.wear_percent {
                    drive_details.push(format!("W: {}%", wear as i32));
                }

                let drive_text = if !drive_details.is_empty() {
                    format!("Data_{}: {} {}", i + 1, drive.name, drive_details.join(" "))
                } else {
                    format!("Data_{}: {}", i + 1, drive.name)
                };

                // Last data drive uses └─ if there's no parity, otherwise ├─
                let tree_symbol = if is_last_data && !has_parity { " └─ " } else { " ├─ " };
                let mut data_spans = vec![
                    Span::styled(tree_symbol, Typography::tree()),
                ];
                data_spans.extend(StatusIcons::create_status_spans(drive.status.clone(), &drive_text));
                lines.push(Line::from(data_spans));
            }

            // Parity drives - last item(s)
            if !pool.parity_drives.is_empty() {
                for (i, drive) in pool.parity_drives.iter().enumerate() {
                    let is_last = i == pool.parity_drives.len() - 1;
                    let mut drive_details = Vec::new();
                    if let Some(temp) = drive.temperature {
                        drive_details.push(format!("T: {}°C", temp as i32));
                    }
                    if let Some(wear) = drive.wear_percent {
                        drive_details.push(format!("W: {}%", wear as i32));
                    }

                    let drive_text = if !drive_details.is_empty() {
                        format!("Parity: {} {}", drive.name, drive_details.join(" "))
                    } else {
                        format!("Parity: {}", drive.name)
                    };

                    let tree_symbol = if is_last { " └─ " } else { " ├─ " };
                    let mut parity_spans = vec![
                        Span::styled(tree_symbol, Typography::tree()),
                    ];
                    parity_spans.extend(StatusIcons::create_status_spans(drive.status.clone(), &drive_text));
                    lines.push(Line::from(parity_spans));
                }
            }
        }
    }

    lines
}
|
||||
}
|
||||
|
||||
/// Truncate serial number to last 8 characters
///
/// Returns the full string when it is 8 characters or shorter. Operates on
/// `char` boundaries: the previous byte-index version
/// (`serial[len - 8..]`) both miscounted multi-byte UTF-8 characters and
/// panicked whenever `len - 8` fell inside one.
fn truncate_serial(serial: &str) -> String {
    let char_count = serial.chars().count();
    if char_count > 8 {
        // Skip everything but the trailing 8 characters.
        serial.chars().skip(char_count - 8).collect()
    } else {
        serial.to_string()
    }
}
|
||||
|
||||
impl SystemWidget {
|
||||
/// Render backup section for display
///
/// Layout: a disk line (truncated serial plus T/W details, icon mapped
/// from the raw backup status string), then an optional "Time: …" line and
/// an optional "Usage: …" line. Nothing is rendered when no repository
/// disk serial is known.
fn render_backup(&self) -> Vec<Line<'_>> {
    let mut lines = Vec::new();

    // First line: serial number with temperature and wear
    if let Some(serial) = &self.backup_disk_serial {
        let truncated_serial = truncate_serial(serial);
        let mut details = Vec::new();
        if let Some(temp) = self.backup_disk_temperature {
            details.push(format!("T: {}°C", temp as i32));
        }
        if let Some(wear) = self.backup_disk_wear_percent {
            details.push(format!("W: {}%", wear as i32));
        }

        let disk_text = if !details.is_empty() {
            format!("{} {}", truncated_serial, details.join(" "))
        } else {
            truncated_serial
        };

        // Map the agent's raw status string onto a display status.
        let backup_status = match self.backup_status.as_str() {
            "completed" | "success" => Status::Ok,
            "running" => Status::Pending,
            "failed" => Status::Critical,
            _ => Status::Unknown,
        };

        let disk_spans = StatusIcons::create_status_spans(backup_status, &disk_text);
        lines.push(Line::from(disk_spans));

        // Show backup time from TOML if available
        // NOTE(review): this line always uses ├─ even when the Usage line
        // below is absent, leaving a dangling branch — confirm acceptable.
        if let Some(start_time) = &self.backup_start_time_raw {
            let time_text = if let Some(size) = self.backup_last_size_gb {
                format!("Time: {} ({:.1}GB)", start_time, size)
            } else {
                format!("Time: {}", start_time)
            };

            lines.push(Line::from(vec![
                Span::styled(" ├─ ", Typography::tree()),
                Span::styled(time_text, Typography::secondary())
            ]));
        }

        // Usage information
        if let (Some(used), Some(total), Some(usage_percent)) = (
            self.backup_disk_used_gb,
            self.backup_disk_total_gb,
            self.backup_disk_usage_percent
        ) {
            let usage_text = format!("Usage: {:.0}% {:.0}GB/{:.0}GB", usage_percent, used, total);
            let usage_spans = StatusIcons::create_status_spans(Status::Ok, &usage_text);
            let mut full_spans = vec![
                Span::styled(" └─ ", Typography::tree()),
            ];
            full_spans.extend(usage_spans);
            lines.push(Line::from(full_spans));
        }
    }

    lines
}
|
||||
|
||||
/// Compress IPv4 addresses that share a /24-style prefix with the address
/// immediately before them, keeping only the last octet.
/// Example: "192.168.30.1, 192.168.30.100" -> "192.168.30.1, 100"
fn compress_ipv4_addresses(addresses: &[String]) -> String {
    match addresses {
        // Nothing to show.
        [] => String::new(),
        // A single address is never compressed.
        [only] => only.clone(),
        _ => {
            let mut rendered: Vec<String> = Vec::with_capacity(addresses.len());
            let mut current_prefix = String::new();

            for address in addresses {
                let octets: Vec<&str> = address.split('.').collect();
                if octets.len() != 4 {
                    // Not dotted-quad: emit unchanged and keep the running
                    // prefix as-is.
                    rendered.push(address.clone());
                    continue;
                }

                let prefix = format!("{}.{}.{}", octets[0], octets[1], octets[2]);
                if prefix == current_prefix {
                    // Same subnet as the previous address: last octet only.
                    rendered.push(octets[3].to_string());
                } else {
                    // New subnet: show the full address and remember it.
                    rendered.push(address.clone());
                    current_prefix = prefix;
                }
            }

            rendered.join(", ")
        }
    }
}
|
||||
|
||||
/// Render network section for display with physical/virtual grouping
///
/// Physical interfaces become headers whose children are their own
/// IPv4/IPv6 addresses followed by attached virtual interfaces (sorted
/// VLANs first). Parent-less virtual interfaces are appended at the end.
/// The └─/├─ choice tracks whether a line is the very last item overall.
fn render_network(&self) -> Vec<Line<'_>> {
    let mut lines = Vec::new();

    if self.network_interfaces.is_empty() {
        return lines;
    }

    // Separate physical and virtual interfaces
    let physical: Vec<_> = self.network_interfaces.iter().filter(|i| i.is_physical).collect();
    let virtual_interfaces: Vec<_> = self.network_interfaces.iter().filter(|i| !i.is_physical).collect();

    // Find standalone virtual interfaces (those without a parent)
    let mut standalone_virtual: Vec<_> = virtual_interfaces.iter()
        .filter(|i| i.parent_interface.is_none())
        .collect();

    // Sort standalone virtual: VLANs first (by VLAN ID), then others alphabetically
    standalone_virtual.sort_by(|a, b| {
        match (a.vlan_id, b.vlan_id) {
            (Some(vlan_a), Some(vlan_b)) => vlan_a.cmp(&vlan_b),
            (Some(_), None) => std::cmp::Ordering::Less,
            (None, Some(_)) => std::cmp::Ordering::Greater,
            (None, None) => a.name.cmp(&b.name),
        }
    });

    // Render physical interfaces with their children
    for (phy_idx, interface) in physical.iter().enumerate() {
        // True only for the last physical interface when no standalone
        // virtual interfaces follow — controls the closing └─ symbol.
        let is_last_physical = phy_idx == physical.len() - 1 && standalone_virtual.is_empty();

        // Physical interface header with status icon
        let mut header_spans = vec![];
        header_spans.extend(StatusIcons::create_status_spans(
            interface.link_status.clone(),
            &format!("{}:", interface.name)
        ));
        lines.push(Line::from(header_spans));

        // Find child interfaces for this physical interface
        let mut children: Vec<_> = virtual_interfaces.iter()
            .filter(|vi| {
                if let Some(parent) = &vi.parent_interface {
                    parent == &interface.name
                } else {
                    false
                }
            })
            .collect();

        // Sort children: VLANs first (by VLAN ID), then others alphabetically
        children.sort_by(|a, b| {
            match (a.vlan_id, b.vlan_id) {
                (Some(vlan_a), Some(vlan_b)) => vlan_a.cmp(&vlan_b),
                (Some(_), None) => std::cmp::Ordering::Less,
                (None, Some(_)) => std::cmp::Ordering::Greater,
                (None, None) => a.name.cmp(&b.name),
            }
        });

        // Count total items under this physical interface (IPs + children)
        let ip_count = interface.ipv4_addresses.len() + interface.ipv6_addresses.len();
        let total_children = ip_count + children.len();
        let mut child_index = 0;

        // IPv4 addresses on the physical interface itself
        for ipv4 in &interface.ipv4_addresses {
            child_index += 1;
            let is_last = child_index == total_children && is_last_physical;
            let tree_symbol = if is_last { " └─ " } else { " ├─ " };
            lines.push(Line::from(vec![
                Span::styled(tree_symbol, Typography::tree()),
                Span::styled(format!("ip: {}", ipv4), Typography::secondary()),
            ]));
        }

        // IPv6 addresses on the physical interface itself
        for ipv6 in &interface.ipv6_addresses {
            child_index += 1;
            let is_last = child_index == total_children && is_last_physical;
            let tree_symbol = if is_last { " └─ " } else { " ├─ " };
            lines.push(Line::from(vec![
                Span::styled(tree_symbol, Typography::tree()),
                Span::styled(format!("ip: {}", ipv6), Typography::secondary()),
            ]));
        }

        // Child virtual interfaces (VLANs, etc.)
        for child in children {
            child_index += 1;
            let is_last = child_index == total_children && is_last_physical;
            let tree_symbol = if is_last { " └─ " } else { " ├─ " };

            // Prefer IPv4 (compressed); fall back to joined IPv6.
            let ip_text = if !child.ipv4_addresses.is_empty() {
                Self::compress_ipv4_addresses(&child.ipv4_addresses)
            } else if !child.ipv6_addresses.is_empty() {
                child.ipv6_addresses.join(", ")
            } else {
                String::new()
            };

            // Format: "name (vlan X): IP" or "name: IP"
            let child_text = if let Some(vlan_id) = child.vlan_id {
                if !ip_text.is_empty() {
                    format!("{} (vlan {}): {}", child.name, vlan_id, ip_text)
                } else {
                    format!("{} (vlan {}):", child.name, vlan_id)
                }
            } else {
                if !ip_text.is_empty() {
                    format!("{}: {}", child.name, ip_text)
                } else {
                    format!("{}:", child.name)
                }
            };

            lines.push(Line::from(vec![
                Span::styled(tree_symbol, Typography::tree()),
                Span::styled(child_text, Typography::secondary()),
            ]));
        }
    }

    // Render standalone virtual interfaces (those without a parent)
    for (virt_idx, interface) in standalone_virtual.iter().enumerate() {
        let is_last = virt_idx == standalone_virtual.len() - 1;
        let tree_symbol = if is_last { " └─ " } else { " ├─ " };

        // Virtual interface with IPs
        let ip_text = if !interface.ipv4_addresses.is_empty() {
            Self::compress_ipv4_addresses(&interface.ipv4_addresses)
        } else if !interface.ipv6_addresses.is_empty() {
            interface.ipv6_addresses.join(", ")
        } else {
            String::new()
        };

        // Format: "name (vlan X): IP" or "name: IP"
        let interface_text = if let Some(vlan_id) = interface.vlan_id {
            if !ip_text.is_empty() {
                format!("{} (vlan {}): {}", interface.name, vlan_id, ip_text)
            } else {
                format!("{} (vlan {}):", interface.name, vlan_id)
            }
        } else {
            if !ip_text.is_empty() {
                format!("{}: {}", interface.name, ip_text)
            } else {
                format!("{}:", interface.name)
            }
        };

        lines.push(Line::from(vec![
            Span::styled(tree_symbol, Typography::tree()),
            Span::styled(interface_text, Typography::secondary()),
        ]));
    }

    lines
}
|
||||
|
||||
/// Render system widget
///
/// Builds the full line list (NixOS, CPU, RAM, Network, Storage and
/// optional Backup sections) and draws it into `area`. When the content
/// exceeds the area height, it is truncated and a "... N more below"
/// marker line is appended.
pub fn render(&mut self, frame: &mut Frame, area: Rect, hostname: &str, _config: Option<&crate::config::DashboardConfig>) {
    let mut lines = Vec::new();

    // NixOS section
    lines.push(Line::from(vec![
        Span::styled(format!("NixOS {}:", hostname), Typography::widget_title())
    ]));

    let build_text = self.nixos_build.as_deref().unwrap_or("unknown");
    lines.push(Line::from(vec![
        Span::styled(format!("Build: {}", build_text), Typography::secondary())
    ]));

    let agent_version_text = self.agent_hash.as_deref().unwrap_or("unknown");
    lines.push(Line::from(vec![
        Span::styled(format!("Agent: {}", agent_version_text), Typography::secondary())
    ]));

    // CPU section
    lines.push(Line::from(vec![
        Span::styled("CPU:", Typography::widget_title())
    ]));

    let load_text = self.format_cpu_load();
    let cpu_spans = StatusIcons::create_status_spans(
        self.cpu_status.clone(),
        &format!("Load: {}", load_text)
    );
    lines.push(Line::from(cpu_spans));

    let freq_text = self.format_cpu_frequency();
    lines.push(Line::from(vec![
        Span::styled(" └─ ", Typography::tree()),
        Span::styled(format!("Freq: {}", freq_text), Typography::secondary())
    ]));

    // RAM section
    lines.push(Line::from(vec![
        Span::styled("RAM:", Typography::widget_title())
    ]));

    let memory_text = self.format_memory_usage();
    let memory_spans = StatusIcons::create_status_spans(
        self.memory_status.clone(),
        &format!("Usage: {}", memory_text)
    );
    lines.push(Line::from(memory_spans));

    // Display all tmpfs mounts
    for (i, tmpfs) in self.tmpfs_mounts.iter().enumerate() {
        let is_last = i == self.tmpfs_mounts.len() - 1;
        let tree_symbol = if is_last { " └─ " } else { " ├─ " };

        let usage_text = if tmpfs.total_gb > 0.0 {
            format!("{:.0}% {:.1}GB/{:.1}GB",
                tmpfs.usage_percent,
                tmpfs.used_gb,
                tmpfs.total_gb)
        } else {
            "— —/—".to_string()
        };

        let mut tmpfs_spans = vec![
            Span::styled(tree_symbol, Typography::tree()),
        ];
        tmpfs_spans.extend(StatusIcons::create_status_spans(
            Status::Ok, // TODO: Calculate status based on usage_percent
            &format!("{}: {}", tmpfs.mount, usage_text)
        ));
        lines.push(Line::from(tmpfs_spans));
    }

    // Network section
    if !self.network_interfaces.is_empty() {
        lines.push(Line::from(vec![
            Span::styled("Network:", Typography::widget_title())
        ]));

        let network_lines = self.render_network();
        lines.extend(network_lines);
    }

    // Storage section
    lines.push(Line::from(vec![
        Span::styled("Storage:", Typography::widget_title())
    ]));

    // Storage items - let main overflow logic handle truncation
    let storage_lines = self.render_storage();
    lines.extend(storage_lines);

    // Backup section (if available)
    if self.backup_status != "unavailable" && self.backup_status != "unknown" {
        lines.push(Line::from(vec![
            Span::styled("Backup:", Typography::widget_title())
        ]));

        let backup_lines = self.render_backup();
        lines.extend(backup_lines);
    }

    // Apply scroll offset
    let total_lines = lines.len();
    let available_height = area.height as usize;

    // Show only what fits, with "X more below" if needed
    if total_lines > available_height {
        let lines_for_content = available_height.saturating_sub(1); // Reserve one line for "more below"
        let mut visible_lines: Vec<Line> = lines
            .into_iter()
            .take(lines_for_content)
            .collect();

        let hidden_below = total_lines.saturating_sub(lines_for_content);
        if hidden_below > 0 {
            let more_line = Line::from(vec![
                Span::styled(format!("... {} more below", hidden_below), Typography::muted())
            ]);
            visible_lines.push(more_line);
        }

        let paragraph = Paragraph::new(Text::from(visible_lines));
        frame.render_widget(paragraph, area);
    } else {
        // All content fits and no scroll offset, render normally
        let paragraph = Paragraph::new(Text::from(lines));
        frame.render_widget(paragraph, area);
    }
}
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
// TODO: Implement utils module
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.0"
|
||||
version = "0.1.184"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
231
shared/src/agent_data.rs
Normal file
231
shared/src/agent_data.rs
Normal file
@@ -0,0 +1,231 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use crate::Status;
|
||||
|
||||
/// Complete structured data from an agent
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AgentData {
    pub hostname: String,
    pub agent_version: String,
    pub build_version: Option<String>,
    // NOTE(review): presumably a Unix epoch timestamp — confirm units/origin.
    pub timestamp: u64,
    pub system: SystemData,
    pub services: Vec<ServiceData>,
    pub backup: BackupData,
}
|
||||
|
||||
/// System-level monitoring data
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemData {
    pub network: NetworkData,
    pub cpu: CpuData,
    pub memory: MemoryData,
    pub storage: StorageData,
}
|
||||
|
||||
/// Network interface monitoring data
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkData {
    pub interfaces: Vec<NetworkInterfaceData>,
}
|
||||
|
||||
/// Individual network interface data
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkInterfaceData {
    pub name: String,
    pub ipv4_addresses: Vec<String>,
    pub ipv6_addresses: Vec<String>,
    /// Consumers group interfaces by this flag: physical interfaces become
    /// tree headers, non-physical ones are rendered as virtual children.
    pub is_physical: bool,
    pub link_status: Status,
    /// Name of the owning physical interface for virtual interfaces;
    /// `None` for physical or standalone virtual interfaces.
    pub parent_interface: Option<String>,
    /// VLAN ID when the interface is a VLAN; used for sort order and labels.
    pub vlan_id: Option<u16>,
}
|
||||
|
||||
/// CPU monitoring data
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CpuData {
|
||||
pub load_1min: f32,
|
||||
pub load_5min: f32,
|
||||
pub load_15min: f32,
|
||||
pub frequency_mhz: f32,
|
||||
pub temperature_celsius: Option<f32>,
|
||||
pub load_status: Status,
|
||||
pub temperature_status: Status,
|
||||
}
|
||||
|
||||
/// Memory monitoring data
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MemoryData {
|
||||
pub usage_percent: f32,
|
||||
pub total_gb: f32,
|
||||
pub used_gb: f32,
|
||||
pub available_gb: f32,
|
||||
pub swap_total_gb: f32,
|
||||
pub swap_used_gb: f32,
|
||||
pub tmpfs: Vec<TmpfsData>,
|
||||
pub usage_status: Status,
|
||||
}
|
||||
|
||||
/// Tmpfs filesystem data
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct TmpfsData {
|
||||
pub mount: String,
|
||||
pub usage_percent: f32,
|
||||
pub used_gb: f32,
|
||||
pub total_gb: f32,
|
||||
}
|
||||
|
||||
/// Storage monitoring data
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct StorageData {
|
||||
pub drives: Vec<DriveData>,
|
||||
pub pools: Vec<PoolData>,
|
||||
}
|
||||
|
||||
/// Individual drive data
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DriveData {
|
||||
pub name: String,
|
||||
pub serial_number: Option<String>,
|
||||
pub health: String,
|
||||
pub temperature_celsius: Option<f32>,
|
||||
pub wear_percent: Option<f32>,
|
||||
pub filesystems: Vec<FilesystemData>,
|
||||
pub temperature_status: Status,
|
||||
pub health_status: Status,
|
||||
}
|
||||
|
||||
/// Filesystem on a drive
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct FilesystemData {
|
||||
pub mount: String,
|
||||
pub usage_percent: f32,
|
||||
pub used_gb: f32,
|
||||
pub total_gb: f32,
|
||||
pub usage_status: Status,
|
||||
}
|
||||
|
||||
/// Storage pool (MergerFS, RAID, etc.)
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PoolData {
|
||||
pub name: String,
|
||||
pub mount: String,
|
||||
pub pool_type: String, // "mergerfs", "raid", etc.
|
||||
pub health: String,
|
||||
pub usage_percent: f32,
|
||||
pub used_gb: f32,
|
||||
pub total_gb: f32,
|
||||
pub data_drives: Vec<PoolDriveData>,
|
||||
pub parity_drives: Vec<PoolDriveData>,
|
||||
pub health_status: Status,
|
||||
pub usage_status: Status,
|
||||
}
|
||||
|
||||
/// Drive in a storage pool
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PoolDriveData {
|
||||
pub name: String,
|
||||
pub serial_number: Option<String>,
|
||||
pub temperature_celsius: Option<f32>,
|
||||
pub wear_percent: Option<f32>,
|
||||
pub health: String,
|
||||
pub health_status: Status,
|
||||
pub temperature_status: Status,
|
||||
}
|
||||
|
||||
/// Service monitoring data
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ServiceData {
|
||||
pub name: String,
|
||||
pub memory_mb: f32,
|
||||
pub disk_gb: f32,
|
||||
pub user_stopped: bool,
|
||||
pub service_status: Status,
|
||||
pub sub_services: Vec<SubServiceData>,
|
||||
}
|
||||
|
||||
/// Sub-service data (nginx sites, docker containers, etc.)
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SubServiceData {
|
||||
pub name: String,
|
||||
pub service_status: Status,
|
||||
pub metrics: Vec<SubServiceMetric>,
|
||||
}
|
||||
|
||||
/// Individual metric for a sub-service
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SubServiceMetric {
|
||||
pub label: String,
|
||||
pub value: f32,
|
||||
pub unit: Option<String>,
|
||||
}
|
||||
|
||||
/// Backup system data
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct BackupData {
|
||||
pub status: String,
|
||||
pub total_size_gb: Option<f32>,
|
||||
pub repository_health: Option<String>,
|
||||
pub repository_disk: Option<BackupDiskData>,
|
||||
pub last_backup_size_gb: Option<f32>,
|
||||
pub start_time_raw: Option<String>,
|
||||
}
|
||||
|
||||
/// Backup repository disk information
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct BackupDiskData {
|
||||
pub serial: String,
|
||||
pub usage_percent: f32,
|
||||
pub used_gb: f32,
|
||||
pub total_gb: f32,
|
||||
pub wear_percent: Option<f32>,
|
||||
pub temperature_celsius: Option<f32>,
|
||||
}
|
||||
|
||||
impl AgentData {
|
||||
/// Create new agent data with current timestamp
|
||||
pub fn new(hostname: String, agent_version: String) -> Self {
|
||||
Self {
|
||||
hostname,
|
||||
agent_version,
|
||||
build_version: None,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
system: SystemData {
|
||||
network: NetworkData {
|
||||
interfaces: Vec::new(),
|
||||
},
|
||||
cpu: CpuData {
|
||||
load_1min: 0.0,
|
||||
load_5min: 0.0,
|
||||
load_15min: 0.0,
|
||||
frequency_mhz: 0.0,
|
||||
temperature_celsius: None,
|
||||
load_status: Status::Unknown,
|
||||
temperature_status: Status::Unknown,
|
||||
},
|
||||
memory: MemoryData {
|
||||
usage_percent: 0.0,
|
||||
total_gb: 0.0,
|
||||
used_gb: 0.0,
|
||||
available_gb: 0.0,
|
||||
swap_total_gb: 0.0,
|
||||
swap_used_gb: 0.0,
|
||||
tmpfs: Vec::new(),
|
||||
usage_status: Status::Unknown,
|
||||
},
|
||||
storage: StorageData {
|
||||
drives: Vec::new(),
|
||||
pools: Vec::new(),
|
||||
},
|
||||
},
|
||||
services: Vec::new(),
|
||||
backup: BackupData {
|
||||
status: "unknown".to_string(),
|
||||
total_size_gb: None,
|
||||
repository_health: None,
|
||||
repository_disk: None,
|
||||
last_backup_size_gb: None,
|
||||
start_time_raw: None,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,171 +1,16 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Cache tier configuration
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct CacheTier {
|
||||
pub interval_seconds: u64,
|
||||
pub description: String,
|
||||
}
|
||||
|
||||
/// Cache configuration
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct CacheConfig {
|
||||
pub enabled: bool,
|
||||
pub default_ttl_seconds: u64,
|
||||
pub max_entries: usize,
|
||||
pub warming_timeout_seconds: u64,
|
||||
pub background_refresh_enabled: bool,
|
||||
pub cleanup_interval_seconds: u64,
|
||||
pub tiers: HashMap<String, CacheTier>,
|
||||
pub metric_assignments: HashMap<String, String>,
|
||||
pub persist_path: String,
|
||||
}
|
||||
|
||||
impl Default for CacheConfig {
|
||||
fn default() -> Self {
|
||||
let mut tiers = HashMap::new();
|
||||
tiers.insert("realtime".to_string(), CacheTier {
|
||||
interval_seconds: 2,
|
||||
description: "Memory/CPU operations - no disk I/O (CPU, memory, service CPU/RAM)".to_string(),
|
||||
});
|
||||
tiers.insert("disk_light".to_string(), CacheTier {
|
||||
interval_seconds: 60,
|
||||
description: "Light disk operations - 1 minute (service status checks)".to_string(),
|
||||
});
|
||||
tiers.insert("disk_medium".to_string(), CacheTier {
|
||||
interval_seconds: 300,
|
||||
description: "Medium disk operations - 5 minutes (disk usage, service disk)".to_string(),
|
||||
});
|
||||
tiers.insert("disk_heavy".to_string(), CacheTier {
|
||||
interval_seconds: 900,
|
||||
description: "Heavy disk operations - 15 minutes (SMART data, backup status)".to_string(),
|
||||
});
|
||||
tiers.insert("static".to_string(), CacheTier {
|
||||
interval_seconds: 3600,
|
||||
description: "Hardware info that rarely changes - 1 hour".to_string(),
|
||||
});
|
||||
|
||||
let mut metric_assignments = HashMap::new();
|
||||
|
||||
// REALTIME (2s) - Memory/CPU operations, no disk I/O
|
||||
metric_assignments.insert("cpu_load_*".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("cpu_temperature_*".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("cpu_frequency_*".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("memory_*".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("service_*_cpu_percent".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("service_*_memory_mb".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("network_*".to_string(), "realtime".to_string());
|
||||
|
||||
// DISK_LIGHT (1min) - Light disk operations: service status checks
|
||||
metric_assignments.insert("service_*_status".to_string(), "disk_light".to_string());
|
||||
|
||||
// DISK_MEDIUM (5min) - Medium disk operations: du commands, disk usage
|
||||
metric_assignments.insert("service_*_disk_gb".to_string(), "disk_medium".to_string());
|
||||
metric_assignments.insert("disk_tmp_*".to_string(), "disk_medium".to_string());
|
||||
metric_assignments.insert("disk_*_usage_*".to_string(), "disk_medium".to_string());
|
||||
metric_assignments.insert("disk_*_size_*".to_string(), "disk_medium".to_string());
|
||||
|
||||
// DISK_HEAVY (15min) - Heavy disk operations: SMART data, backup status
|
||||
metric_assignments.insert("disk_*_temperature".to_string(), "disk_heavy".to_string());
|
||||
metric_assignments.insert("disk_*_wear_percent".to_string(), "disk_heavy".to_string());
|
||||
metric_assignments.insert("smart_*".to_string(), "disk_heavy".to_string());
|
||||
metric_assignments.insert("backup_*".to_string(), "disk_heavy".to_string());
|
||||
|
||||
Self {
|
||||
enabled: true,
|
||||
default_ttl_seconds: 30,
|
||||
max_entries: 10000,
|
||||
warming_timeout_seconds: 3,
|
||||
background_refresh_enabled: true,
|
||||
cleanup_interval_seconds: 1800,
|
||||
tiers,
|
||||
metric_assignments,
|
||||
persist_path: "/var/lib/cm-dashboard/cache.json".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CacheConfig {
|
||||
/// Get the cache tier for a metric name
|
||||
pub fn get_tier_for_metric(&self, metric_name: &str) -> Option<&CacheTier> {
|
||||
// Find matching pattern
|
||||
for (pattern, tier_name) in &self.metric_assignments {
|
||||
if self.matches_pattern(metric_name, pattern) {
|
||||
return self.tiers.get(tier_name);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Check if metric name matches pattern (supports wildcards)
|
||||
fn matches_pattern(&self, metric_name: &str, pattern: &str) -> bool {
|
||||
if pattern.contains('*') {
|
||||
// Convert pattern to regex-like matching
|
||||
let pattern_parts: Vec<&str> = pattern.split('*').collect();
|
||||
|
||||
if pattern_parts.len() == 2 {
|
||||
let prefix = pattern_parts[0];
|
||||
let suffix = pattern_parts[1];
|
||||
|
||||
if suffix.is_empty() {
|
||||
// Pattern like "cpu_*" - just check prefix
|
||||
metric_name.starts_with(prefix)
|
||||
} else if prefix.is_empty() {
|
||||
// Pattern like "*_status" - just check suffix
|
||||
metric_name.ends_with(suffix)
|
||||
} else {
|
||||
// Pattern like "service_*_disk_gb" - check prefix and suffix
|
||||
metric_name.starts_with(prefix) && metric_name.ends_with(suffix)
|
||||
}
|
||||
} else {
|
||||
// More complex patterns - for now, just check if all parts are present
|
||||
pattern_parts.iter().all(|part| {
|
||||
part.is_empty() || metric_name.contains(part)
|
||||
})
|
||||
}
|
||||
} else {
|
||||
metric_name == pattern
|
||||
}
|
||||
}
|
||||
|
||||
/// Get cache interval for a metric
|
||||
pub fn get_cache_interval(&self, metric_name: &str) -> u64 {
|
||||
self.get_tier_for_metric(metric_name)
|
||||
.map(|tier| tier.interval_seconds)
|
||||
.unwrap_or(self.default_ttl_seconds)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_pattern_matching() {
|
||||
let config = CacheConfig::default();
|
||||
|
||||
assert!(config.matches_pattern("cpu_load_1min", "cpu_load_*"));
|
||||
assert!(config.matches_pattern("service_nginx_disk_gb", "service_*_disk_gb"));
|
||||
assert!(!config.matches_pattern("memory_usage_percent", "cpu_load_*"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tier_assignment() {
|
||||
let config = CacheConfig::default();
|
||||
|
||||
// Realtime (2s) - CPU/Memory operations
|
||||
assert_eq!(config.get_cache_interval("cpu_load_1min"), 2);
|
||||
assert_eq!(config.get_cache_interval("memory_usage_percent"), 2);
|
||||
assert_eq!(config.get_cache_interval("service_nginx_cpu_percent"), 2);
|
||||
|
||||
// Disk light (60s) - Service status
|
||||
assert_eq!(config.get_cache_interval("service_nginx_status"), 60);
|
||||
|
||||
// Disk medium (300s) - Disk usage
|
||||
assert_eq!(config.get_cache_interval("service_nginx_disk_gb"), 300);
|
||||
assert_eq!(config.get_cache_interval("disk_tmp_usage_percent"), 300);
|
||||
|
||||
// Disk heavy (900s) - SMART data
|
||||
assert_eq!(config.get_cache_interval("disk_nvme0_temperature"), 900);
|
||||
assert_eq!(config.get_cache_interval("smart_nvme0_wear_percent"), 900);
|
||||
}
|
||||
}
|
||||
@@ -4,10 +4,10 @@ use thiserror::Error;
|
||||
pub enum SharedError {
|
||||
#[error("Serialization error: {message}")]
|
||||
Serialization { message: String },
|
||||
|
||||
|
||||
#[error("Invalid metric value: {message}")]
|
||||
InvalidMetric { message: String },
|
||||
|
||||
|
||||
#[error("Protocol error: {message}")]
|
||||
Protocol { message: String },
|
||||
}
|
||||
@@ -18,4 +18,4 @@ impl From<serde_json::Error> for SharedError {
|
||||
message: err.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
pub mod agent_data;
|
||||
pub mod cache;
|
||||
pub mod error;
|
||||
pub mod metrics;
|
||||
pub mod protocol;
|
||||
|
||||
pub use agent_data::*;
|
||||
pub use cache::*;
|
||||
pub use error::*;
|
||||
pub use metrics::*;
|
||||
pub use protocol::*;
|
||||
pub use protocol::*;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use chrono::Utc;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Individual metric with value, status, and metadata
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -23,12 +24,12 @@ impl Metric {
|
||||
unit: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn with_description(mut self, description: String) -> Self {
|
||||
self.description = Some(description);
|
||||
self
|
||||
}
|
||||
|
||||
|
||||
pub fn with_unit(mut self, unit: String) -> Self {
|
||||
self.unit = Some(unit);
|
||||
self
|
||||
@@ -52,7 +53,7 @@ impl MetricValue {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn as_i64(&self) -> Option<i64> {
|
||||
match self {
|
||||
MetricValue::Integer(i) => Some(*i),
|
||||
@@ -60,7 +61,7 @@ impl MetricValue {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn as_string(&self) -> String {
|
||||
match self {
|
||||
MetricValue::String(s) => s.clone(),
|
||||
@@ -69,7 +70,7 @@ impl MetricValue {
|
||||
MetricValue::Boolean(b) => b.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn as_bool(&self) -> Option<bool> {
|
||||
match self {
|
||||
MetricValue::Boolean(b) => Some(*b),
|
||||
@@ -81,10 +82,13 @@ impl MetricValue {
|
||||
/// Health status for metrics
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum Status {
|
||||
Ok,
|
||||
Warning,
|
||||
Critical,
|
||||
Unknown,
|
||||
Inactive, // Lowest priority
|
||||
Unknown, //
|
||||
Offline, //
|
||||
Pending, //
|
||||
Ok, // 5th place - good status has higher priority than unknown states
|
||||
Warning, //
|
||||
Critical, // Highest priority
|
||||
}
|
||||
|
||||
impl Status {
|
||||
@@ -100,6 +104,159 @@ impl Default for Status {
|
||||
}
|
||||
}
|
||||
|
||||
/// Hysteresis thresholds for preventing status flapping
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct HysteresisThresholds {
|
||||
/// Warning threshold - trigger warning when value >= this
|
||||
pub warning_high: f32,
|
||||
/// Warning recovery - return to ok when value < this
|
||||
pub warning_low: f32,
|
||||
/// Critical threshold - trigger critical when value >= this
|
||||
pub critical_high: f32,
|
||||
/// Critical recovery - return to warning when value < this
|
||||
pub critical_low: f32,
|
||||
}
|
||||
|
||||
impl HysteresisThresholds {
|
||||
pub fn new(warning_high: f32, critical_high: f32) -> Self {
|
||||
// Default hysteresis: 10% gap for recovery
|
||||
let warning_gap = warning_high * 0.1;
|
||||
let critical_gap = critical_high * 0.1;
|
||||
|
||||
Self {
|
||||
warning_high,
|
||||
warning_low: warning_high - warning_gap,
|
||||
critical_high,
|
||||
critical_low: critical_high - critical_gap,
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluate value against thresholds to determine status
|
||||
pub fn evaluate(&self, value: f32) -> Status {
|
||||
if value >= self.critical_high {
|
||||
Status::Critical
|
||||
} else if value >= self.warning_high {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_custom_gaps(warning_high: f32, warning_gap: f32, critical_high: f32, critical_gap: f32) -> Self {
|
||||
Self {
|
||||
warning_high,
|
||||
warning_low: warning_high - warning_gap,
|
||||
critical_high,
|
||||
critical_low: critical_high - critical_gap,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate status with hysteresis based on current value and previous status
|
||||
pub fn calculate_status(&self, value: f32, previous_status: Status) -> Status {
|
||||
match previous_status {
|
||||
Status::Ok => {
|
||||
if value >= self.critical_high {
|
||||
Status::Critical
|
||||
} else if value >= self.warning_high {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
Status::Warning => {
|
||||
if value >= self.critical_high {
|
||||
Status::Critical
|
||||
} else if value < self.warning_low {
|
||||
Status::Ok
|
||||
} else {
|
||||
Status::Warning
|
||||
}
|
||||
}
|
||||
Status::Critical => {
|
||||
if value < self.critical_low {
|
||||
if value < self.warning_low {
|
||||
Status::Ok
|
||||
} else {
|
||||
Status::Warning
|
||||
}
|
||||
} else {
|
||||
Status::Critical
|
||||
}
|
||||
}
|
||||
Status::Unknown => {
|
||||
// First measurement, use normal thresholds
|
||||
if value >= self.critical_high {
|
||||
Status::Critical
|
||||
} else if value >= self.warning_high {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
Status::Inactive => {
|
||||
// Inactive services use normal thresholds like first measurement
|
||||
if value >= self.critical_high {
|
||||
Status::Critical
|
||||
} else if value >= self.warning_high {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
Status::Pending => {
|
||||
// Service transitioning, use normal thresholds like first measurement
|
||||
if value >= self.critical_high {
|
||||
Status::Critical
|
||||
} else if value >= self.warning_high {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
Status::Offline => {
|
||||
// Host coming back online, use normal thresholds like first measurement
|
||||
if value >= self.critical_high {
|
||||
Status::Critical
|
||||
} else if value >= self.warning_high {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Status tracker for hysteresis - tracks previous status per metric
|
||||
#[derive(Debug, Default)]
|
||||
pub struct StatusTracker {
|
||||
previous_statuses: HashMap<String, Status>,
|
||||
}
|
||||
|
||||
impl StatusTracker {
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Get previous status for a metric
|
||||
pub fn get_previous_status(&self, metric_name: &str) -> Status {
|
||||
self.previous_statuses.get(metric_name).copied().unwrap_or(Status::Unknown)
|
||||
}
|
||||
|
||||
/// Update status for a metric
|
||||
pub fn update_status(&mut self, metric_name: String, status: Status) {
|
||||
self.previous_statuses.insert(metric_name, status);
|
||||
}
|
||||
|
||||
/// Calculate status with hysteresis
|
||||
pub fn calculate_with_hysteresis(&mut self, metric_name: &str, value: f32, thresholds: &HysteresisThresholds) -> Status {
|
||||
let previous = self.get_previous_status(metric_name);
|
||||
let new_status = thresholds.calculate_status(value, previous);
|
||||
self.update_status(metric_name.to_string(), new_status);
|
||||
new_status
|
||||
}
|
||||
}
|
||||
|
||||
/// Metric name registry - constants for all metric names
|
||||
pub mod registry {
|
||||
// CPU metrics
|
||||
@@ -109,7 +266,7 @@ pub mod registry {
|
||||
pub const CPU_TEMPERATURE_CELSIUS: &str = "cpu_temperature_celsius";
|
||||
pub const CPU_FREQUENCY_MHZ: &str = "cpu_frequency_mhz";
|
||||
pub const CPU_USAGE_PERCENT: &str = "cpu_usage_percent";
|
||||
|
||||
|
||||
// Memory metrics
|
||||
pub const MEMORY_USAGE_PERCENT: &str = "memory_usage_percent";
|
||||
pub const MEMORY_TOTAL_GB: &str = "memory_total_gb";
|
||||
@@ -117,7 +274,7 @@ pub mod registry {
|
||||
pub const MEMORY_AVAILABLE_GB: &str = "memory_available_gb";
|
||||
pub const MEMORY_SWAP_TOTAL_GB: &str = "memory_swap_total_gb";
|
||||
pub const MEMORY_SWAP_USED_GB: &str = "memory_swap_used_gb";
|
||||
|
||||
|
||||
// Disk metrics (template - actual names include device)
|
||||
pub const DISK_USAGE_PERCENT_TEMPLATE: &str = "disk_{device}_usage_percent";
|
||||
pub const DISK_TEMPERATURE_CELSIUS_TEMPLATE: &str = "disk_{device}_temperature_celsius";
|
||||
@@ -125,37 +282,37 @@ pub mod registry {
|
||||
pub const DISK_SPARE_PERCENT_TEMPLATE: &str = "disk_{device}_spare_percent";
|
||||
pub const DISK_HOURS_TEMPLATE: &str = "disk_{device}_hours";
|
||||
pub const DISK_CAPACITY_GB_TEMPLATE: &str = "disk_{device}_capacity_gb";
|
||||
|
||||
|
||||
// Service metrics (template - actual names include service)
|
||||
pub const SERVICE_STATUS_TEMPLATE: &str = "service_{name}_status";
|
||||
pub const SERVICE_MEMORY_MB_TEMPLATE: &str = "service_{name}_memory_mb";
|
||||
pub const SERVICE_CPU_PERCENT_TEMPLATE: &str = "service_{name}_cpu_percent";
|
||||
|
||||
|
||||
// Backup metrics
|
||||
pub const BACKUP_STATUS: &str = "backup_status";
|
||||
pub const BACKUP_LAST_RUN_TIMESTAMP: &str = "backup_last_run_timestamp";
|
||||
pub const BACKUP_SIZE_GB: &str = "backup_size_gb";
|
||||
pub const BACKUP_DURATION_MINUTES: &str = "backup_duration_minutes";
|
||||
pub const BACKUP_NEXT_SCHEDULED_TIMESTAMP: &str = "backup_next_scheduled_timestamp";
|
||||
|
||||
|
||||
// Network metrics (template - actual names include interface)
|
||||
pub const NETWORK_RX_BYTES_TEMPLATE: &str = "network_{interface}_rx_bytes";
|
||||
pub const NETWORK_TX_BYTES_TEMPLATE: &str = "network_{interface}_tx_bytes";
|
||||
pub const NETWORK_RX_PACKETS_TEMPLATE: &str = "network_{interface}_rx_packets";
|
||||
pub const NETWORK_TX_PACKETS_TEMPLATE: &str = "network_{interface}_tx_packets";
|
||||
|
||||
|
||||
/// Generate disk metric name from template
|
||||
pub fn disk_metric(template: &str, device: &str) -> String {
|
||||
template.replace("{device}", device)
|
||||
}
|
||||
|
||||
|
||||
/// Generate service metric name from template
|
||||
pub fn service_metric(template: &str, name: &str) -> String {
|
||||
template.replace("{name}", name)
|
||||
}
|
||||
|
||||
|
||||
/// Generate network metric name from template
|
||||
pub fn network_metric(template: &str, interface: &str) -> String {
|
||||
template.replace("{interface}", interface)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,20 +1,31 @@
|
||||
use crate::agent_data::AgentData;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use crate::metrics::Metric;
|
||||
|
||||
/// Message sent from agent to dashboard via ZMQ
|
||||
/// Message sent from agent to dashboard via ZMQ
|
||||
/// Always structured data - no legacy metrics support
|
||||
pub type AgentMessage = AgentData;
|
||||
|
||||
/// Command output streaming message
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MetricMessage {
|
||||
pub struct CommandOutputMessage {
|
||||
pub hostname: String,
|
||||
pub command_id: String,
|
||||
pub command_type: String,
|
||||
pub output_line: String,
|
||||
pub is_complete: bool,
|
||||
pub timestamp: u64,
|
||||
pub metrics: Vec<Metric>,
|
||||
}
|
||||
|
||||
impl MetricMessage {
|
||||
pub fn new(hostname: String, metrics: Vec<Metric>) -> Self {
|
||||
|
||||
impl CommandOutputMessage {
|
||||
pub fn new(hostname: String, command_id: String, command_type: String, output_line: String, is_complete: bool) -> Self {
|
||||
Self {
|
||||
hostname,
|
||||
command_id,
|
||||
command_type,
|
||||
output_line,
|
||||
is_complete,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
metrics,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -35,8 +46,8 @@ pub enum Command {
|
||||
pub enum CommandResponse {
|
||||
/// Acknowledgment of command
|
||||
Ack,
|
||||
/// Metrics response
|
||||
Metrics(Vec<Metric>),
|
||||
/// Agent data response
|
||||
AgentData(AgentData),
|
||||
/// Pong response to ping
|
||||
Pong,
|
||||
/// Error response
|
||||
@@ -52,50 +63,58 @@ pub struct MessageEnvelope {
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub enum MessageType {
|
||||
Metrics,
|
||||
AgentData,
|
||||
Command,
|
||||
CommandResponse,
|
||||
CommandOutput,
|
||||
Heartbeat,
|
||||
}
|
||||
|
||||
impl MessageEnvelope {
|
||||
pub fn metrics(message: MetricMessage) -> Result<Self, crate::SharedError> {
|
||||
pub fn agent_data(data: AgentData) -> Result<Self, crate::SharedError> {
|
||||
Ok(Self {
|
||||
message_type: MessageType::Metrics,
|
||||
payload: serde_json::to_vec(&message)?,
|
||||
message_type: MessageType::AgentData,
|
||||
payload: serde_json::to_vec(&data)?,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
pub fn command(command: Command) -> Result<Self, crate::SharedError> {
|
||||
Ok(Self {
|
||||
message_type: MessageType::Command,
|
||||
payload: serde_json::to_vec(&command)?,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
pub fn command_response(response: CommandResponse) -> Result<Self, crate::SharedError> {
|
||||
Ok(Self {
|
||||
message_type: MessageType::CommandResponse,
|
||||
payload: serde_json::to_vec(&response)?,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
pub fn command_output(message: CommandOutputMessage) -> Result<Self, crate::SharedError> {
|
||||
Ok(Self {
|
||||
message_type: MessageType::CommandOutput,
|
||||
payload: serde_json::to_vec(&message)?,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn heartbeat() -> Result<Self, crate::SharedError> {
|
||||
Ok(Self {
|
||||
message_type: MessageType::Heartbeat,
|
||||
payload: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn decode_metrics(&self) -> Result<MetricMessage, crate::SharedError> {
|
||||
|
||||
pub fn decode_agent_data(&self) -> Result<AgentData, crate::SharedError> {
|
||||
match self.message_type {
|
||||
MessageType::Metrics => Ok(serde_json::from_slice(&self.payload)?),
|
||||
MessageType::AgentData => Ok(serde_json::from_slice(&self.payload)?),
|
||||
_ => Err(crate::SharedError::Protocol {
|
||||
message: "Expected metrics message".to_string(),
|
||||
message: "Expected agent data message".to_string(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn decode_command(&self) -> Result<Command, crate::SharedError> {
|
||||
match self.message_type {
|
||||
MessageType::Command => Ok(serde_json::from_slice(&self.payload)?),
|
||||
@@ -104,7 +123,7 @@ impl MessageEnvelope {
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn decode_command_response(&self) -> Result<CommandResponse, crate::SharedError> {
|
||||
match self.message_type {
|
||||
MessageType::CommandResponse => Ok(serde_json::from_slice(&self.payload)?),
|
||||
@@ -113,4 +132,13 @@ impl MessageEnvelope {
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_command_output(&self) -> Result<CommandOutputMessage, crate::SharedError> {
|
||||
match self.message_type {
|
||||
MessageType::CommandOutput => Ok(serde_json::from_slice(&self.payload)?),
|
||||
_ => Err(crate::SharedError::Protocol {
|
||||
message: "Expected command output message".to_string(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,152 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Test script for smart caching agent
|
||||
# Debug why only System collector works but Services/SMART/Backup don't
|
||||
|
||||
set -e
|
||||
|
||||
echo "=== CM Dashboard Smart Agent Debug Test ==="
|
||||
echo "Testing smart caching implementation..."
|
||||
echo
|
||||
|
||||
# Build the agent first
|
||||
echo "Building agent..."
|
||||
OPENSSL_DIR=/nix/store/cz9k6nhxjppa1kmyf5npd0g8l89xzilw-openssl-3.5.2-dev \
|
||||
OPENSSL_LIB_DIR=/nix/store/0837wpkjb27cr70bi3pc4g2rw5v9r63l-openssl-3.5.2/lib \
|
||||
OPENSSL_INCLUDE_DIR=/nix/store/cz9k6nhxjppa1kmyf5npd0g8l89xzilw-openssl-3.5.2-dev/include \
|
||||
PKG_CONFIG_PATH=/nix/store/cz9k6nhxjppa1kmyf5npd0g8l89xzilw-openssl-3.5.2-dev/lib/pkgconfig \
|
||||
OPENSSL_NO_VENDOR=1 cargo build --workspace --release
|
||||
echo "✓ Build completed"
|
||||
echo
|
||||
|
||||
# Test 1: Verify agent starts and shows all collectors
|
||||
echo "Test 1: Agent startup and collector initialization"
|
||||
timeout 15s ./target/release/cm-dashboard-agent -v 2>&1 | tee /tmp/agent_startup.log &
|
||||
AGENT_PID=$!
|
||||
sleep 8
|
||||
|
||||
if kill -0 $AGENT_PID 2>/dev/null; then
|
||||
echo "✓ Smart agent started successfully"
|
||||
kill $AGENT_PID 2>/dev/null || true
|
||||
wait $AGENT_PID 2>/dev/null || true
|
||||
else
|
||||
echo "✗ Smart agent failed to start"
|
||||
exit 1
|
||||
fi
|
||||
echo
|
||||
|
||||
# Test 2: Analyze startup logs for collector initialization
|
||||
echo "Test 2: Collector initialization analysis"
|
||||
echo "Looking for collector setup messages:"
|
||||
grep -E "(monitoring|collector|initialized)" /tmp/agent_startup.log || true
|
||||
echo
|
||||
|
||||
echo "Looking for cache-related messages:"
|
||||
grep -E "(cache|warming|tier)" /tmp/agent_startup.log || true
|
||||
echo
|
||||
|
||||
echo "Looking for error messages:"
|
||||
grep -E "(error|failed|Error)" /tmp/agent_startup.log || true
|
||||
echo
|
||||
|
||||
# Test 3: Check if all expected collectors are mentioned
|
||||
echo "Test 3: Expected collector verification"
|
||||
EXPECTED_COLLECTORS=("SMART monitoring" "System monitoring" "Service monitoring" "Backup monitoring")
|
||||
for collector in "${EXPECTED_COLLECTORS[@]}"; do
|
||||
if grep -q "$collector" /tmp/agent_startup.log; then
|
||||
echo "✓ Found: $collector"
|
||||
else
|
||||
echo "✗ Missing: $collector"
|
||||
fi
|
||||
done
|
||||
echo
|
||||
|
||||
# Test 4: ZMQ message inspection (run agent for 20 seconds and capture messages)
|
||||
echo "Test 4: ZMQ message capture and analysis"
|
||||
echo "Starting agent and capturing ZMQ messages for 20 seconds..."
|
||||
|
||||
# Start the agent in background
|
||||
timeout 25s ./target/release/cm-dashboard-agent -v > /tmp/agent_output.log 2>&1 &
|
||||
AGENT_PID=$!
|
||||
|
||||
# Give agent time to start and warm cache
|
||||
sleep 5
|
||||
|
||||
# Use netcat or ss to check ZMQ port
|
||||
echo "Checking ZMQ port 6130:"
|
||||
ss -tlnp | grep 6130 || echo "ZMQ port not found"
|
||||
|
||||
# Monitor for a bit more
|
||||
sleep 15
|
||||
|
||||
# Stop agent
|
||||
if kill -0 $AGENT_PID 2>/dev/null; then
|
||||
kill $AGENT_PID 2>/dev/null || true
|
||||
wait $AGENT_PID 2>/dev/null || true
|
||||
fi
|
||||
|
||||
echo "Agent output analysis:"
|
||||
echo "Total lines of output: $(wc -l < /tmp/agent_output.log)"
|
||||
echo
|
||||
|
||||
echo "Cache-related messages:"
|
||||
grep -E "(cache|Cache|warming|Warming|tier|Tier)" /tmp/agent_output.log | head -10 || echo "No cache messages found"
|
||||
echo
|
||||
|
||||
echo "Collection messages:"
|
||||
grep -E "(collection|Collection|collected|Collected)" /tmp/agent_output.log | head -10 || echo "No collection messages found"
|
||||
echo
|
||||
|
||||
echo "Error messages:"
|
||||
grep -E "(error|Error|failed|Failed)" /tmp/agent_output.log || echo "No errors found"
|
||||
echo
|
||||
|
||||
# Test 5: Check tier assignment
|
||||
echo "Test 5: Cache tier analysis"
|
||||
echo "Searching for tier assignments in startup:"
|
||||
grep -E "(RealTime|Fast|Medium|Slow|Static)" /tmp/agent_startup.log || echo "No tier information found"
|
||||
echo
|
||||
|
||||
# Test 6: Collection interval analysis
|
||||
echo "Test 6: Collection interval verification"
|
||||
echo "Expected intervals:"
|
||||
echo "- System (RealTime): 5 seconds"
|
||||
echo "- Services (Medium): 5 minutes"
|
||||
echo "- SMART (Slow): 15 minutes"
|
||||
echo "- Backup (Slow): 15 minutes"
|
||||
echo
|
||||
|
||||
echo "Actual intervals found in logs:"
|
||||
grep -E "(\d+\w+ intervals|\d+s intervals|\d+min intervals)" /tmp/agent_startup.log || echo "No interval information found"
|
||||
echo
|
||||
|
||||
# Test 7: Manual collector test (if possible)
|
||||
echo "Test 7: Service discovery test"
|
||||
echo "Checking what services would be discovered:"
|
||||
if [ -f "./target/release/cm-dashboard-agent" ]; then
|
||||
echo "Services that should be monitored:"
|
||||
systemctl list-units --state=active --type=service | grep -E "(gitea|immich|postgres|unifi|vaultwarden|nginx|docker|ssh)" | head -5 || echo "No interesting services found"
|
||||
fi
|
||||
echo
|
||||
|
||||
# Test 8: Check for threading issues
|
||||
echo "Test 8: Threading and async analysis"
|
||||
echo "Looking for async/threading issues:"
|
||||
grep -E "(tokio|async|await|thread)" /tmp/agent_output.log | head -5 || echo "No async-related messages"
|
||||
echo
|
||||
|
||||
echo "=== Test Summary ==="
|
||||
echo "Agent startup log: /tmp/agent_startup.log"
|
||||
echo "Agent runtime log: /tmp/agent_output.log"
|
||||
echo
|
||||
echo "Key findings:"
|
||||
echo "1. Agent starts: $([ -f /tmp/agent_startup.log ] && echo "✓" || echo "✗")"
|
||||
echo "2. Collectors found: $(grep -c "monitoring" /tmp/agent_startup.log 2>/dev/null || echo "0")"
|
||||
echo "3. Cache messages: $(grep -c -i cache /tmp/agent_output.log 2>/dev/null || echo "0")"
|
||||
echo "4. Errors found: $(grep -c -i error /tmp/agent_output.log 2>/dev/null || echo "0")"
|
||||
echo
|
||||
echo "Next steps if issues found:"
|
||||
echo "- Check collector initialization in smart_agent.rs"
|
||||
echo "- Verify cache tier assignments and intervals"
|
||||
echo "- Debug collection scheduling in collect_tier() method"
|
||||
echo "- Test individual collectors outside of smart caching"
|
||||
Reference in New Issue
Block a user