#!/usr/bin/env bash
# cm-dashboard/test_smart_agent.sh
#
# Test script for smart caching agent
# Debug why only System collector works but Services/SMART/Backup don't
#
# The interpreter line must be the very first line of the file to take effect,
# and bash is resolved through PATH because /bin/bash is not present on every
# system.

# Abort on the first unguarded command failure.
set -e

echo "=== CM Dashboard Smart Agent Debug Test ==="
echo "Testing smart caching implementation..."
echo

# Build the agent first
# The build is pointed at a pinned OpenSSL from the Nix store: the "-dev"
# prefix provides the include/ and lib/pkgconfig directories, the other prefix
# provides lib/. The variables are exported inside a subshell so they reach
# cargo only and do not leak into the rest of this script.
echo "Building agent..."
(
    openssl_dev_prefix=/nix/store/cz9k6nhxjppa1kmyf5npd0g8l89xzilw-openssl-3.5.2-dev
    openssl_lib_prefix=/nix/store/0837wpkjb27cr70bi3pc4g2rw5v9r63l-openssl-3.5.2

    export OPENSSL_DIR="$openssl_dev_prefix"
    export OPENSSL_LIB_DIR="$openssl_lib_prefix/lib"
    export OPENSSL_INCLUDE_DIR="$openssl_dev_prefix/include"
    export PKG_CONFIG_PATH="$openssl_dev_prefix/lib/pkgconfig"
    export OPENSSL_NO_VENDOR=1

    cargo build --workspace --release
)
echo "✓ Build completed"
echo

# Test 1: Verify agent starts and shows all collectors
# Runs the agent for 8 seconds, then checks that it is still alive and stops
# it. The agent's stdout and stderr are written to /tmp/agent_startup.log,
# which the later log-analysis tests read.
echo "Test 1: Agent startup and collector initialization"
# Redirect straight into the log instead of piping through tee: for a
# background pipeline $! is the PID of the LAST command (tee), so the liveness
# check and the kill below would act on tee and leave the agent itself running
# while the following tests start a second instance.
timeout 15s ./target/release/cm-dashboard-agent -v > /tmp/agent_startup.log 2>&1 &
AGENT_PID=$!
sleep 8

if kill -0 "$AGENT_PID" 2>/dev/null; then
    # Still running after 8 seconds: stop it and reap it before moving on.
    kill "$AGENT_PID" 2>/dev/null || true
    wait "$AGENT_PID" 2>/dev/null || true
    # Show the captured output now that it is no longer streamed through tee.
    cat /tmp/agent_startup.log
    echo "✓ Smart agent started successfully"
else
    # The process already exited; show whatever it printed so the reason for
    # the failure is visible.
    cat /tmp/agent_startup.log
    echo "✗ Smart agent failed to start"
    exit 1
fi
echo

# Test 2: Analyze startup logs for collector initialization
# Each section prints its heading, then every line of the startup log matching
# the section's extended regex (case-sensitive), then a blank line. A section
# without matches is not an error, hence the `|| true` guard under `set -e`.
echo "Test 2: Collector initialization analysis"
section_headings=(
    "Looking for collector setup messages:"
    "Looking for cache-related messages:"
    "Looking for error messages:"
)
section_patterns=(
    "(monitoring|collector|initialized)"
    "(cache|warming|tier)"
    "(error|failed|Error)"
)
for section in "${!section_headings[@]}"; do
    echo "${section_headings[section]}"
    grep -E "${section_patterns[section]}" /tmp/agent_startup.log || true
    echo
done

# Test 3: Check if all expected collectors are mentioned
# Reports, for every expected collector label, whether the label occurs
# anywhere in the startup log. The result is informational only: a missing
# collector is printed but does not abort the script.
echo "Test 3: Expected collector verification"
EXPECTED_COLLECTORS=(
    "SMART monitoring"
    "System monitoring"
    "Service monitoring"
    "Backup monitoring"
)
for idx in "${!EXPECTED_COLLECTORS[@]}"; do
    label=${EXPECTED_COLLECTORS[idx]}
    # Assume missing until the log proves otherwise.
    verdict="✗ Missing"
    if grep -q "$label" /tmp/agent_startup.log; then
        verdict="✓ Found"
    fi
    echo "$verdict: $label"
done
echo

# Test 4: ZMQ message inspection (run agent for 20 seconds and capture messages)
# What is actually captured is the agent's stdout/stderr, written to
# /tmp/agent_output.log; the only ZMQ-level check is whether something is
# listening on TCP port 6130 five seconds after launch.
echo "Test 4: ZMQ message capture and analysis"
echo "Starting agent and capturing ZMQ messages for 20 seconds..."

# Start the agent in background; the 25s timeout is a backstop in case this
# script is interrupted before the explicit kill below.
timeout 25s ./target/release/cm-dashboard-agent -v > /tmp/agent_output.log 2>&1 &
AGENT_PID=$!

# Give agent time to start and warm cache
sleep 5

# Use netcat or ss to check ZMQ port
echo "Checking ZMQ port 6130:"
ss -tlnp | grep 6130 || echo "ZMQ port not found"

# Monitor for a bit more
sleep 15

# Stop agent
if kill -0 "$AGENT_PID" 2>/dev/null; then
    kill "$AGENT_PID" 2>/dev/null || true
    wait "$AGENT_PID" 2>/dev/null || true
fi

echo "Agent output analysis:"
echo "Total lines of output: $(wc -l < /tmp/agent_output.log)"
echo

# The matches are captured before printing: in `grep ... | head -10 || echo ...`
# the pipeline's status is head's, which is 0 even when grep finds nothing, so
# the fallback message would never be shown.
echo "Cache-related messages:"
cache_lines=$(grep -E "(cache|Cache|warming|Warming|tier|Tier)" /tmp/agent_output.log | head -10) || true
if [[ -n "$cache_lines" ]]; then
    printf '%s\n' "$cache_lines"
else
    echo "No cache messages found"
fi
echo

echo "Collection messages:"
collection_lines=$(grep -E "(collection|Collection|collected|Collected)" /tmp/agent_output.log | head -10) || true
if [[ -n "$collection_lines" ]]; then
    printf '%s\n' "$collection_lines"
else
    echo "No collection messages found"
fi
echo

# No head here, so grep's own exit status drives the fallback correctly.
echo "Error messages:"
grep -E "(error|Error|failed|Failed)" /tmp/agent_output.log || echo "No errors found"
echo

# Test 5: Check tier assignment
# Prints every startup-log line containing one of the tier names
# (case-sensitive substring match), or a fallback message when none is found.
echo "Test 5: Cache tier analysis"
echo "Searching for tier assignments in startup:"
grep -E "(RealTime|Fast|Medium|Slow|Static)" /tmp/agent_startup.log || echo "No tier information found"
echo

# Test 6: Collection interval analysis
# Prints the intervals this script expects, then the interval mentions that
# actually appear in the startup log so the two can be compared by eye.
echo "Test 6: Collection interval verification"
echo "Expected intervals:"
echo "- System (RealTime): 5 seconds"
echo "- Services (Medium): 5 minutes"
echo "- SMART (Slow): 15 minutes"
echo "- Backup (Slow): 15 minutes"
echo

echo "Actual intervals found in logs:"
# Extended regular expressions have no \d shorthand (that is a Perl-style
# escape), so digits are spelled [0-9]. The pattern matches a number followed
# by at least one more word character and " intervals", which covers forms
# such as "5s intervals" and "15min intervals".
INTERVAL_PATTERN='[0-9]+[[:alnum:]_]+ intervals'
grep -E "$INTERVAL_PATTERN" /tmp/agent_startup.log || echo "No interval information found"
echo

# Print up to LIMIT lines matching an extended regex, or a fallback message
# when nothing matches.
# Arguments: $1 - maximum number of lines to print
#            $2 - extended regular expression (grep -E)
#            $3 - message printed when there is no match
#            $4... - optional files to search; stdin is read when omitted
# The matches are captured first because in `grep ... | head -N || echo ...`
# the pipeline's status is head's, which is 0 even when grep finds nothing, so
# such a fallback would never be printed.
print_matches_or_fallback() {
    local limit=$1 pattern=$2 fallback=$3 matches
    shift 3
    matches=$(grep -E -- "$pattern" "$@" | head -n "$limit") || true
    if [[ -n "$matches" ]]; then
        printf '%s\n' "$matches"
    else
        printf '%s\n' "$fallback"
    fi
}

# Test 7: Manual collector test (if possible)
# Lists up to five active systemd services whose names match the set this
# script is interested in. Only runs when the agent binary has been built.
echo "Test 7: Service discovery test"
echo "Checking what services would be discovered:"
if [[ -f "./target/release/cm-dashboard-agent" ]]; then
    echo "Services that should be monitored:"
    systemctl list-units --state=active --type=service \
        | print_matches_or_fallback 5 "(gitea|immich|postgres|unifi|vaultwarden|nginx|docker|ssh)" "No interesting services found"
fi
echo

# Test 8: Check for threading issues
# Shows the first five runtime-log lines that mention async/threading terms.
echo "Test 8: Threading and async analysis"
echo "Looking for async/threading issues:"
print_matches_or_fallback 5 "(tokio|async|await|thread)" "No async-related messages" /tmp/agent_output.log
echo

# Print the number of lines grep matches as exactly one integer.
# Arguments: passed straight to `grep -c` (options, pattern, one file).
# `grep -c` already prints "0" (and exits 1) when nothing matches, so appending
# `|| echo "0"` inside the substitution would emit a second zero. Only an
# unreadable or missing file leaves the output empty, which is reported as 0.
count_matching_lines() {
    local count
    count=$(grep -c "$@" 2>/dev/null) || true
    printf '%s\n' "${count:-0}"
}

echo "=== Test Summary ==="
echo "Agent startup log: /tmp/agent_startup.log"
echo "Agent runtime log: /tmp/agent_output.log"
echo
echo "Key findings:"
# This only checks that the startup log exists; the liveness check itself
# happens in Test 1, which exits the script on failure.
if [[ -f /tmp/agent_startup.log ]]; then
    startup_mark="✓"
else
    startup_mark="✗"
fi
echo "1. Agent starts: $startup_mark"
echo "2. Collectors found: $(count_matching_lines "monitoring" /tmp/agent_startup.log)"
echo "3. Cache messages: $(count_matching_lines -i cache /tmp/agent_output.log)"
echo "4. Errors found: $(count_matching_lines -i error /tmp/agent_output.log)"
echo
echo "Next steps if issues found:"
echo "- Check collector initialization in smart_agent.rs"
echo "- Verify cache tier assignments and intervals"
echo "- Debug collection scheduling in collect_tier() method"
echo "- Test individual collectors outside of smart caching"