clawdie-ai/scripts/heartbeat.sh

176 lines
6.8 KiB
Bash
Raw Normal View History

#!/usr/bin/env bash
# heartbeat.sh — System health check + HEARTBEAT.md writer
# Runs on cron every 30 min (or via rc.d loop)
# LLM interpretation is optional — health checks run regardless
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
# Load .env
if [ -f "$PROJECT_DIR/.env" ]; then
set -a
# shellcheck disable=SC1091
. "$PROJECT_DIR/.env"
set +a
fi
# Config (with defaults — derive llama-cpp IP from env, never hardcode)
_LLAMA_IP="${WARDEN_LLAMA_CPP_IP:-${JAIL_SUBNET_BASE:-10.0.1}.5}"
CHAT_BASE_URL="${CHAT_BASE_URL:-http://${_LLAMA_IP}:8081/v1}"
CHAT_MODEL="${CHAT_MODEL:-dolphin3.0-phi4-mini}"
EMBED_BASE_URL="${EMBED_BASE_URL:-http://${_LLAMA_IP}:8080/v1}"
HEARTBEAT_FILE="$PROJECT_DIR/HEARTBEAT.md"
LOG_DIR="$PROJECT_DIR/logs"
TENANT_ID="${TENANT_ID:-clawdie}"
mkdir -p "$LOG_DIR"
NOW="$(date '+%d.%b.%Y %H:%M')"
NOW_SORT="$(date '+%Y-%m-%d %H:%M:%S')"
# ── Health checks ─────────────────────────────────────────────────────────────
STATUS="OK"
FLAGS=""
_flag() {
local level="$1" msg="$2"
FLAGS="${FLAGS}- ${level}: ${msg}\n"
[ "$level" = "CRIT" ] && STATUS="CRIT"
[ "$level" = "WARN" ] && [ "$STATUS" != "CRIT" ] && STATUS="WARN"
}
# Jails — try prefixed name first, fall back to short name
# Handles legacy prefixed naming; llama-cpp → llamacpp variant
JAIL_STATUS=""
BASTILLE_LIST="$("${SCRIPT_DIR:-$(dirname "$0")}/hostd-call.sh" bastille-list 2>/dev/null || true)"
_jail_up() {
local name="$1"
printf '%s' "$BASTILLE_LIST" | awk '{print $2}' | grep -qxF "$name"
}
for role in controlplane db cms llama-cpp; do
role_short="${role//-/}" # llama-cpp → llamacpp
prefixed="${TENANT_ID}-${role}"
if _jail_up "$prefixed"; then
JAIL_STATUS="${JAIL_STATUS} ${prefixed} ✓\n"
elif _jail_up "$role"; then
JAIL_STATUS="${JAIL_STATUS} ${role} ✓\n"
elif _jail_up "$role_short"; then
JAIL_STATUS="${JAIL_STATUS} ${role_short} ✓\n"
else
JAIL_STATUS="${JAIL_STATUS} ${prefixed} ✗\n"
_flag "WARN" "Jail ${role} not running"
fi
done
# PostgreSQL / memory DB
. "$SCRIPT_DIR/memory/common.sh" 2>/dev/null || true
MEM_COUNT="?"
MEM_LATEST="?"
if command -v psql >/dev/null 2>&1 && [ -n "${DB_HOST:-}" ]; then
MEM_COUNT=$(PGPASSWORD="$MEMORY_DB_PASSWORD" psql -h "$DB_HOST" -p "${DB_PORT:-5432}" -U "$DB_USER" -d "$DB_NAME" \
--no-align --tuples-only --quiet --connect-timeout=5 \
-c "SELECT count(*) FROM memories;" 2>/dev/null || echo "?")
MEM_LATEST=$(PGPASSWORD="$MEMORY_DB_PASSWORD" psql -h "$DB_HOST" -p "${DB_PORT:-5432}" -U "$DB_USER" -d "$DB_NAME" \
--no-align --tuples-only --quiet --connect-timeout=5 \
-c "SELECT to_char(max(created_at), 'DD.Mon.YYYY HH24:MI') FROM memories;" 2>/dev/null || echo "?")
[ "$MEM_COUNT" = "?" ] && _flag "CRIT" "PostgreSQL unreachable"
fi
# Embedding server
# For local llama-cpp servers check /health; external APIs (OpenAI etc.) are assumed up.
EMBED_STATUS="✗"
EMBED_HOST="${EMBED_BASE_URL%%/v1*}" # strip /v1 suffix
if [[ "$EMBED_BASE_URL" =~ (localhost|127\.|192\.168\.|10\.) ]]; then
if curl -sf --max-time 5 "${EMBED_HOST}/health" >/dev/null 2>&1; then
EMBED_STATUS="✓"
else
_flag "WARN" "Embedding server unreachable at ${EMBED_HOST}"
fi
else
# External API — mark as up; network failures will surface via memory search errors
EMBED_STATUS="✓ (ext)"
fi
# Chat server
CHAT_STATUS="✗"
if curl -sf --max-time 5 "${CHAT_BASE_URL%/v1}/health" >/dev/null 2>&1; then
CHAT_STATUS="✓"
fi
# Chat down is not a WARN — it's optional for heartbeat
# Disk
ZFS_USED="?"
if command -v zfs >/dev/null 2>&1; then
ZFS_USED=$(zfs list -H -o used,avail zroot 2>/dev/null \
| awk '{printf "%s used / %s avail", $1, $2}' || echo "?")
fi
# ── LLM interpretation ────────────────────────────────────────────────────────
SUMMARY="LLM unavailable — raw data only."
if [ "$CHAT_STATUS" = "✓" ]; then
HEALTH_SNAPSHOT="Jails: $(printf '%b' "$JAIL_STATUS" | tr '\n' '|') | Memories: ${MEM_COUNT} (latest: ${MEM_LATEST}) | Embed: ${EMBED_STATUS} | Disk: ${ZFS_USED}"
PROMPT="You are ${TENANT_ID}'s system monitor. Current state: ${HEALTH_SNAPSHOT}. Flags: ${FLAGS:-none}. Summarise in 2 sentences, mention any concerns."
SUMMARY=$(curl -sf --max-time 15 "${CHAT_BASE_URL}/chat/completions" \
-H "Content-Type: application/json" \
-d "{\"model\":\"${CHAT_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":$(printf '%s' "$PROMPT" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read()))')}],\"max_tokens\":120,\"temperature\":0.3}" \
2>/dev/null \
| python3 -c 'import json,sys; d=json.load(sys.stdin); print(d["choices"][0]["message"]["content"].strip())' \
2>/dev/null || echo "LLM parse error — raw data only.")
fi
# ── Write HEARTBEAT.md ────────────────────────────────────────────────────────
{
echo "# System Heartbeat"
echo ""
echo "> Last checked: ${NOW} | Status: **${STATUS}**"
echo ""
echo "## Health"
printf '%b' "$JAIL_STATUS"
echo "- PostgreSQL: ${MEM_COUNT} memories, latest ${MEM_LATEST}"
echo "- embed (${EMBED_BASE_URL##*//*}) ${EMBED_STATUS} chat :8081 ${CHAT_STATUS}"
echo "- Disk: ${ZFS_USED}"
echo ""
echo "## Memory Snapshot"
echo "- ${MEM_COUNT} memories | latest: ${MEM_LATEST}"
echo ""
echo "## Summary"
echo "$SUMMARY"
echo ""
echo "## Flags"
if [ -n "$FLAGS" ]; then
printf '%b' "$FLAGS"
else
echo "(none)"
fi
echo ""
echo "---"
echo "_Generated by scripts/heartbeat.sh at ${NOW_SORT}_"
} > "$HEARTBEAT_FILE"
# ── Telegram notification on WARN/CRIT ───────────────────────────────────────
NOTIFY="${HEARTBEAT_NOTIFY_TELEGRAM:-false}"
if [ "$NOTIFY" = "true" ] && [ "$STATUS" != "OK" ]; then
ICON="⚠️"
[ "$STATUS" = "CRIT" ] && ICON="🚨"
MSG="${ICON} ${TENANT_ID} heartbeat ${STATUS}${NOW}
${SUMMARY}
Flags:
$(printf '%b' "$FLAGS")"
if [ -n "${TELEGRAM_BOT_TOKEN:-}" ] && [ -n "${TELEGRAM_MAIN_CHAT_ID:-}" ]; then
curl -sf --max-time 10 "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
-d "chat_id=${TELEGRAM_MAIN_CHAT_ID}" \
--data-urlencode "text=${MSG}" \
>/dev/null 2>&1 || true
fi
fi
echo "[${NOW_SORT}] heartbeat ${STATUS}${MEM_COUNT} memories, embed ${EMBED_STATUS}, chat ${CHAT_STATUS}"