Route remaining sudo call sites through hostd-call.sh / hostd:
- scripts/destroy-jails.sh: bastille stop/destroy via hostd-call.sh
- scripts/docs-sync.cron.sh: nginx reload via service-restart op
- scripts/heartbeat.sh: bastille list via hostd-call.sh
- src/startup-report.ts: drop sudo bastille/pkg fallbacks; tighten
buildStartupReport signature now that hostdData is always supplied
Relies on 537c613 (non-interactive bastille-destroy) so the
yes-pipe in destroy-jails.sh is no longer needed.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
175 lines
6.8 KiB
Bash
Executable file
175 lines
6.8 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
# heartbeat.sh — System health check + HEARTBEAT.md writer
|
|
# Runs on cron every 30 min (or via rc.d loop)
|
|
# LLM interpretation is optional — health checks run regardless
|
|
|
|
set -euo pipefail
|
|
|
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
|
|
# Load .env
|
|
if [ -f "$PROJECT_DIR/.env" ]; then
|
|
set -a
|
|
# shellcheck disable=SC1091
|
|
. "$PROJECT_DIR/.env"
|
|
set +a
|
|
fi
|
|
|
|
# Config (with defaults — derive llama-cpp IP from env, never hardcode)
|
|
_LLAMA_IP="${WARDEN_LLAMA_CPP_IP:-${JAIL_SUBNET_BASE:-10.0.1}.5}"
|
|
CHAT_BASE_URL="${CHAT_BASE_URL:-http://${_LLAMA_IP}:8081/v1}"
|
|
CHAT_MODEL="${CHAT_MODEL:-dolphin3.0-phi4-mini}"
|
|
EMBED_BASE_URL="${EMBED_BASE_URL:-http://${_LLAMA_IP}:8080/v1}"
|
|
HEARTBEAT_FILE="$PROJECT_DIR/HEARTBEAT.md"
|
|
LOG_DIR="$PROJECT_DIR/logs"
|
|
TENANT_ID="${TENANT_ID:-clawdie}"
|
|
|
|
mkdir -p "$LOG_DIR"
|
|
|
|
NOW="$(date '+%d.%b.%Y %H:%M')"
|
|
NOW_SORT="$(date '+%Y-%m-%d %H:%M:%S')"
|
|
|
|
# ── Health checks ─────────────────────────────────────────────────────────────
|
|
|
|
STATUS="OK"
|
|
FLAGS=""
|
|
|
|
_flag() {
|
|
local level="$1" msg="$2"
|
|
FLAGS="${FLAGS}- ${level}: ${msg}\n"
|
|
[ "$level" = "CRIT" ] && STATUS="CRIT"
|
|
[ "$level" = "WARN" ] && [ "$STATUS" != "CRIT" ] && STATUS="WARN"
|
|
}
|
|
|
|
# Jails — try prefixed name first, fall back to short name
|
|
# Handles legacy prefixed naming; llama-cpp → llamacpp variant
|
|
JAIL_STATUS=""
|
|
BASTILLE_LIST="$("${SCRIPT_DIR:-$(dirname "$0")}/hostd-call.sh" bastille-list 2>/dev/null || true)"
|
|
_jail_up() {
|
|
local name="$1"
|
|
printf '%s' "$BASTILLE_LIST" | awk '{print $2}' | grep -qxF "$name"
|
|
}
|
|
for role in controlplane db cms llama-cpp; do
|
|
role_short="${role//-/}" # llama-cpp → llamacpp
|
|
prefixed="${TENANT_ID}-${role}"
|
|
if _jail_up "$prefixed"; then
|
|
JAIL_STATUS="${JAIL_STATUS} ${prefixed} ✓\n"
|
|
elif _jail_up "$role"; then
|
|
JAIL_STATUS="${JAIL_STATUS} ${role} ✓\n"
|
|
elif _jail_up "$role_short"; then
|
|
JAIL_STATUS="${JAIL_STATUS} ${role_short} ✓\n"
|
|
else
|
|
JAIL_STATUS="${JAIL_STATUS} ${prefixed} ✗\n"
|
|
_flag "WARN" "Jail ${role} not running"
|
|
fi
|
|
done
|
|
|
|
# PostgreSQL / memory DB
|
|
. "$SCRIPT_DIR/memory/common.sh" 2>/dev/null || true
|
|
MEM_COUNT="?"
|
|
MEM_LATEST="?"
|
|
if command -v psql >/dev/null 2>&1 && [ -n "${DB_HOST:-}" ]; then
|
|
MEM_COUNT=$(PGPASSWORD="$MEMORY_DB_PASSWORD" psql -h "$DB_HOST" -p "${DB_PORT:-5432}" -U "$DB_USER" -d "$DB_NAME" \
|
|
--no-align --tuples-only --quiet --connect-timeout=5 \
|
|
-c "SELECT count(*) FROM memories;" 2>/dev/null || echo "?")
|
|
MEM_LATEST=$(PGPASSWORD="$MEMORY_DB_PASSWORD" psql -h "$DB_HOST" -p "${DB_PORT:-5432}" -U "$DB_USER" -d "$DB_NAME" \
|
|
--no-align --tuples-only --quiet --connect-timeout=5 \
|
|
-c "SELECT to_char(max(created_at), 'DD.Mon.YYYY HH24:MI') FROM memories;" 2>/dev/null || echo "?")
|
|
[ "$MEM_COUNT" = "?" ] && _flag "CRIT" "PostgreSQL unreachable"
|
|
fi
|
|
|
|
# Embedding server
|
|
# For local llama-cpp servers check /health; external APIs (OpenAI etc.) are assumed up.
|
|
EMBED_STATUS="✗"
|
|
EMBED_HOST="${EMBED_BASE_URL%%/v1*}" # strip /v1 suffix
|
|
if [[ "$EMBED_BASE_URL" =~ (localhost|127\.|192\.168\.|10\.) ]]; then
|
|
if curl -sf --max-time 5 "${EMBED_HOST}/health" >/dev/null 2>&1; then
|
|
EMBED_STATUS="✓"
|
|
else
|
|
_flag "WARN" "Embedding server unreachable at ${EMBED_HOST}"
|
|
fi
|
|
else
|
|
# External API — mark as up; network failures will surface via memory search errors
|
|
EMBED_STATUS="✓ (ext)"
|
|
fi
|
|
|
|
# Chat server
|
|
CHAT_STATUS="✗"
|
|
if curl -sf --max-time 5 "${CHAT_BASE_URL%/v1}/health" >/dev/null 2>&1; then
|
|
CHAT_STATUS="✓"
|
|
fi
|
|
# Chat down is not a WARN — it's optional for heartbeat
|
|
|
|
# Disk
|
|
ZFS_USED="?"
|
|
if command -v zfs >/dev/null 2>&1; then
|
|
ZFS_USED=$(zfs list -H -o used,avail zroot 2>/dev/null \
|
|
| awk '{printf "%s used / %s avail", $1, $2}' || echo "?")
|
|
fi
|
|
|
|
# ── LLM interpretation ────────────────────────────────────────────────────────
|
|
|
|
SUMMARY="LLM unavailable — raw data only."
|
|
if [ "$CHAT_STATUS" = "✓" ]; then
|
|
HEALTH_SNAPSHOT="Jails: $(printf '%b' "$JAIL_STATUS" | tr '\n' '|') | Memories: ${MEM_COUNT} (latest: ${MEM_LATEST}) | Embed: ${EMBED_STATUS} | Disk: ${ZFS_USED}"
|
|
PROMPT="You are ${TENANT_ID}'s system monitor. Current state: ${HEALTH_SNAPSHOT}. Flags: ${FLAGS:-none}. Summarise in 2 sentences, mention any concerns."
|
|
|
|
SUMMARY=$(curl -sf --max-time 15 "${CHAT_BASE_URL}/chat/completions" \
|
|
-H "Content-Type: application/json" \
|
|
-d "{\"model\":\"${CHAT_MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":$(printf '%s' "$PROMPT" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read()))')}],\"max_tokens\":120,\"temperature\":0.3}" \
|
|
2>/dev/null \
|
|
| python3 -c 'import json,sys; d=json.load(sys.stdin); print(d["choices"][0]["message"]["content"].strip())' \
|
|
2>/dev/null || echo "LLM parse error — raw data only.")
|
|
fi
|
|
|
|
# ── Write HEARTBEAT.md ────────────────────────────────────────────────────────
|
|
|
|
{
|
|
echo "# System Heartbeat"
|
|
echo ""
|
|
echo "> Last checked: ${NOW} | Status: **${STATUS}**"
|
|
echo ""
|
|
echo "## Health"
|
|
printf '%b' "$JAIL_STATUS"
|
|
echo "- PostgreSQL: ${MEM_COUNT} memories, latest ${MEM_LATEST}"
|
|
echo "- embed (${EMBED_BASE_URL##*//*}) ${EMBED_STATUS} chat :8081 ${CHAT_STATUS}"
|
|
echo "- Disk: ${ZFS_USED}"
|
|
echo ""
|
|
echo "## Memory Snapshot"
|
|
echo "- ${MEM_COUNT} memories | latest: ${MEM_LATEST}"
|
|
echo ""
|
|
echo "## Summary"
|
|
echo "$SUMMARY"
|
|
echo ""
|
|
echo "## Flags"
|
|
if [ -n "$FLAGS" ]; then
|
|
printf '%b' "$FLAGS"
|
|
else
|
|
echo "(none)"
|
|
fi
|
|
echo ""
|
|
echo "---"
|
|
echo "_Generated by scripts/heartbeat.sh at ${NOW_SORT}_"
|
|
} > "$HEARTBEAT_FILE"
|
|
|
|
# ── Telegram notification on WARN/CRIT ───────────────────────────────────────
|
|
|
|
NOTIFY="${HEARTBEAT_NOTIFY_TELEGRAM:-false}"
|
|
if [ "$NOTIFY" = "true" ] && [ "$STATUS" != "OK" ]; then
|
|
ICON="⚠️"
|
|
[ "$STATUS" = "CRIT" ] && ICON="🚨"
|
|
MSG="${ICON} ${TENANT_ID} heartbeat ${STATUS} — ${NOW}
|
|
${SUMMARY}
|
|
Flags:
|
|
$(printf '%b' "$FLAGS")"
|
|
|
|
if [ -n "${TELEGRAM_BOT_TOKEN:-}" ] && [ -n "${TELEGRAM_MAIN_CHAT_ID:-}" ]; then
|
|
curl -sf --max-time 10 "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
|
|
-d "chat_id=${TELEGRAM_MAIN_CHAT_ID}" \
|
|
--data-urlencode "text=${MSG}" \
|
|
>/dev/null 2>&1 || true
|
|
fi
|
|
fi
|
|
|
|
echo "[${NOW_SORT}] heartbeat ${STATUS} — ${MEM_COUNT} memories, embed ${EMBED_STATUS}, chat ${CHAT_STATUS}"
|