clawdie-iso/firstboot/maintenance-mode.sh
Sam & Claude 18586d3f09 Harden firstboot scripts: POSIX fixes, quoting, offline pkg path
- shell-system.sh: exit → return in sourced module (would kill firstboot)
- maintenance-mode.sh: replace bash += with POSIX concat, fix subshell
  scope in pipe-to-while-read (vdev_status/failed_disks were always
  empty), quote all $POOL_NAME and $disk expansions
- build-vps.sh: portable _sed_i() wrapper for FreeBSD/Linux
- firstboot.sh: set -eu; set USB_PKG_PATH to SHARE/packages so offline
  package repo works after HDD boot (was defaulting to /mnt/media)
- firstboot-vps.sh: remove plaintext password log, check loader.efi
  exists before EFI copy

Integration test: PASS (7/7 modules)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-04 20:04:22 +02:00

486 lines
16 KiB
Bash

#!/bin/sh
# maintenance-mode.sh — Clawdie pool maintenance and recovery
#
# Provides:
# 1. Pool health check
# 2. Replace failed disk
# 3. Scrub pool
# 4. Disk space cleanup
# 5. Boot environment management
# 6. Pool migration
# 7. Import read-only (data recovery)
# Abort the script as soon as any unguarded command exits non-zero.
set -e
# Name of the ZFS pool every menu action operates on.
POOL_NAME="clawdie"
# File that log() appends to and that command output is tee'd into.
LOG="/var/log/clawdie-maintenance.log"
# Install prefix of the ISO tooling; build.cfg and the migration script are
# looked up beneath it.
SHARE="/usr/local/share/clawdie-iso"
# dialog ARGS...
# Thin wrapper around bsddialog that puts the maintenance-mode back title on
# every screen. All arguments are forwarded untouched and bsddialog's exit
# status is returned as-is.
dialog() {
  set -- --backtitle "Clawdie Maintenance Mode" "$@"
  bsddialog "$@"
}
# die MESSAGE
# Print "ERROR: MESSAGE" on stderr and terminate the script with status 1.
die() {
  printf 'ERROR: %s\n' "$1" >&2
  exit 1
}
# log MESSAGE
# Append "[YYYY-MM-DD HH:MM:SS] MESSAGE" to $LOG.
#
# Logging is best-effort: the script runs under `set -e`, so a log file that
# cannot be written (read-only or missing /var/log while recovering a system)
# must not abort the maintenance tool. Write errors are therefore suppressed
# and the function always returns 0.
log() {
  {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >> "$LOG"
  } 2>/dev/null || true
}
# Source the build configuration from the install prefix. It runs in this
# shell, so it can override the variables set above.
# NOTE(review): nothing in this file visibly reads a value that is defined
# only in build.cfg — confirm what it is expected to provide. If the file is
# missing, `.` fails and the script stops here.
. "${SHARE}/build.cfg"
# get_disk_info DISK
# Print a one-line, human-readable summary of DISK (a device name such as
# "ada0") on stdout:
#
#   DISK [MODEL] (SIZE_GB) [S/N:SERIAL] [TEMP°C] [HEALTH] [Slot:SLOT]
#
# Every probe is best-effort; a field whose tool is missing or fails is
# simply left out. Always returns 0.
get_disk_info() {
  local disk="$1"
  local model serial size_bytes size_gb temp health slot info

  model=$(camcontrol inquiry "$disk" 2>/dev/null | head -1)

  # Accept both "serial number   XYZ" (no colon) and "Serial Number: XYZ":
  # take whatever follows the label, then drop blanks. Splitting on ':' alone
  # returns the whole line, label included, when there is no colon.
  serial=$(camcontrol identify "$disk" 2>/dev/null |
    awk '{
      i = index(tolower($0), "serial number")
      if (i) {
        v = substr($0, i + 13)
        sub(/^[: \t]+/, "", v)
        gsub(/[ \t]/, "", v)
        print v
        exit
      }
    }')

  # diskinfo prints nothing for an absent disk; an empty operand would make
  # the arithmetic expansion below a fatal syntax error.
  size_bytes=$(diskinfo "$disk" 2>/dev/null | awk '{ print $3; exit }')
  case "$size_bytes" in
    ''|*[!0-9]*) size_bytes=0 ;;
  esac
  size_gb=$((size_bytes / 1073741824))

  # Only the first matching attribute line is used so that the summary stays
  # on a single line.
  temp=$(smartctl -A "$disk" 2>/dev/null |
    awk 'tolower($0) ~ /temperature/ { print $10; exit }')
  health=$(smartctl -H "$disk" 2>/dev/null |
    awk '/^SMART overall-health/ { print $NF; exit }')
  slot=$(sesutil map 2>/dev/null |
    awk -v d="$disk" 'index($0, d) == 1 { print $3; exit }')

  info="$disk"
  if [ -n "$model" ]; then
    info="${info} ${model}"
  fi
  info="${info} (${size_gb}GB)"
  if [ -n "$serial" ]; then info="${info} S/N:${serial}"; fi
  if [ -n "$temp" ]; then info="${info} ${temp}°C"; fi
  if [ -n "$health" ]; then info="${info} ${health}"; fi
  if [ -n "$slot" ]; then info="${info} Slot:${slot}"; fi

  printf '%s\n' "$info"
}
# import_pool_rw
# Make sure $POOL_NAME is imported read-write.
#
# Returns 0 when the pool was already imported or the import succeeded.
# On failure an error box is shown and 1 is returned.
#
# The exit status of `zpool import` is captured directly. Testing the status
# of `zpool import | tee` only ever sees tee's status (plain sh has no
# pipefail), so a failed import would be reported as success.
import_pool_rw() {
  local out rc

  log "Importing pool $POOL_NAME (read-write)"

  if zpool list "$POOL_NAME" >/dev/null 2>&1; then
    log "Pool already imported"
    return 0
  fi

  rc=0
  out=$(zpool import "$POOL_NAME" 2>&1) || rc=$?
  if [ -n "$out" ]; then
    # Show the tool's output and keep a copy in the log; never fatal.
    printf '%s\n' "$out" | tee -a "$LOG" || true
  fi

  if [ "$rc" -ne 0 ]; then
    dialog --msgbox "\nFailed to import pool '$POOL_NAME'.\n\nCheck logs: $LOG" 10 50 || true
    return 1
  fi

  log "Pool imported successfully"
  return 0
}
# import_pool_ro
# Import $POOL_NAME with readonly=on (data-recovery mode).
#
# Returns 0 on success. On failure an error box is shown and 1 is returned.
#
# The exit status of `zpool import` is captured directly. Testing the status
# of `zpool import | tee` only ever sees tee's status (plain sh has no
# pipefail), so a failed import would be reported as success.
import_pool_ro() {
  local out rc

  log "Importing pool $POOL_NAME (read-only)"

  rc=0
  out=$(zpool import -o readonly=on "$POOL_NAME" 2>&1) || rc=$?
  if [ -n "$out" ]; then
    # Show the tool's output and keep a copy in the log; never fatal.
    printf '%s\n' "$out" | tee -a "$LOG" || true
  fi

  if [ "$rc" -ne 0 ]; then
    dialog --msgbox "\nFailed to import pool '$POOL_NAME' (read-only).\n\nCheck logs: $LOG" 10 50 || true
    return 1
  fi

  log "Pool imported read-only"
  return 0
}
# export_pool
# Export $POOL_NAME. The output of `zpool export` (stdout and stderr) is
# echoed and appended to $LOG. The return status is that of the final
# pipeline stage (tee), not of zpool.
export_pool() {
  local pool="$POOL_NAME"
  local logfile="$LOG"

  log "Exporting pool $pool"
  zpool export "$pool" 2>&1 |
    tee -a "$logfile"
}
# menu_pool_health
# Show pool state, scan status, per-disk error counters and the error summary
# for $POOL_NAME, and offer to start a scrub.
#
# Returns 1 if the pool cannot be imported, the status of menu_pool_scrub if
# the user picks "scrub", and 0 otherwise (including cancel).
menu_pool_health() {
  local status state scan errors vdevs vdev_status
  local disk disk_state read_err write_err cksum_err disk_info msg action

  log "Pool health check selected"
  import_pool_rw || return 1

  # grep exits 1 when nothing matches; that is not an error here.
  status=$(zpool status "$POOL_NAME" 2>/dev/null) || true
  state=$(printf '%s\n' "$status" | grep "state:" | head -1) || true
  scan=$(printf '%s\n' "$status" | grep -A2 "scan:") || true
  errors=$(printf '%s\n' "$status" | grep -A5 "errors:") || true

  # One awk pass picks the indented leaf-device rows and their first five
  # columns. The device prefix must be followed by a digit so that wrapped
  # status text beginning with e.g. "da…" is not mistaken for a disk, and
  # only portable regex syntax is used ("\s" is undefined in POSIX EREs).
  vdevs=$(printf '%s\n' "$status" |
    awk '/^[ \t]+(ada|da|nvd|nda)[0-9]/ { print $1, $2, $3, $4, $5 }') || true

  vdev_status=""
  while read -r disk disk_state read_err write_err cksum_err; do
    [ -n "$disk" ] || continue
    disk_info=$(get_disk_info "$disk") || true
    vdev_status="${vdev_status}${disk_info}\n State: ${disk_state} Errors: R:${read_err} W:${write_err} C:${cksum_err}\n\n"
  done <<EOF
$vdevs
EOF

  msg="Pool: $POOL_NAME\n${state}\n\n${scan}\n\n${vdev_status}${errors}"

  # bsddialog draws on stdout and reports the chosen tag on stderr, hence the
  # descriptor swap. A non-zero exit means cancel/ESC.
  action=$(dialog --menu "$msg" 20 75 3 \
    "scrub" "Start scrub (verify data integrity)" \
    "back" "Back to main menu" \
    3>&1 1>&2 2>&3) || return 0

  if [ "$action" = "scrub" ]; then
    menu_pool_scrub
  fi
}
# menu_replace_disk
# Interactive replacement of a failed pool member:
#   1. list pool devices whose state is not ONLINE and let the user pick one,
#   2. list attached disks that are not part of the pool and let the user
#      pick the replacement,
#   3. confirm, then run `zpool replace`.
#
# Returns 0 when the replacement was started, nothing needed replacing, or
# the user cancelled; 1 when the pool cannot be imported, no replacement
# disk is available, the replacement is too small, or `zpool replace` fails.
#
# Implementation notes:
#   * Menu entries are accumulated in the positional parameters and passed
#     as "$@". Building them as one string with embedded quotes and expanding
#     it unquoted splits descriptions on every blank and leaves literal quote
#     characters in the tags.
#   * bsddialog reports --yesno through its exit status, not on stdout, and
#     cancel of a --menu through a non-zero status; both are tested directly.
menu_replace_disk() {
  local status vdevs disk dstate disk_info
  local failed_disk replacement pool_disks all_disks d info
  local failed_size replace_size out rc

  log "Replace failed disk selected"
  import_pool_rw || return 1

  status=$(zpool status "$POOL_NAME" 2>/dev/null) || true

  # Leaf-device rows: indented, known driver prefix followed by a unit number.
  vdevs=$(printf '%s\n' "$status" |
    awk '/^[ \t]+(ada|da|nvd|nda)[0-9]/ { print $1, $2 }') || true

  set --
  while read -r disk dstate; do
    [ -n "$disk" ] || continue
    [ "$dstate" != "ONLINE" ] || continue
    disk_info=$(get_disk_info "$disk") || true
    set -- "$@" "$disk" "${disk_info} (${dstate})"
  done <<EOF
$vdevs
EOF

  if [ $# -eq 0 ]; then
    dialog --msgbox "\nNo failed disks detected.\n\nAll disks are ONLINE." 10 50 || true
    return 0
  fi

  failed_disk=$(dialog --menu \
    "Select failed disk to replace:" \
    15 70 5 \
    "$@" \
    3>&1 1>&2 2>&3) || return 0
  [ -n "$failed_disk" ] || return 0

  # Pool members are usually partitions (ada0p3); reduce them to the base
  # disk so that a disk already carrying a pool member is never offered as a
  # replacement.
  pool_disks=$(printf '%s\n' "$status" |
    awk '/^[ \t]+(ada|da|nvd|nda)[0-9]/ { sub(/p[0-9]+$/, "", $1); print $1 }' |
    sort -u)
  all_disks=$(camcontrol devlist 2>/dev/null |
    grep -oE '(ada|da|nvd|nda)[0-9]+' | sort -u) || true

  set --
  for d in $all_disks; do
    if ! printf '%s\n' "$pool_disks" | grep -qxF -- "$d"; then
      info=$(get_disk_info "$d") || true
      set -- "$@" "$d" "$info"
    fi
  done

  if [ $# -eq 0 ]; then
    dialog --msgbox "\nNo replacement disks available.\n\nConnect a replacement disk and restart." 10 50 || true
    return 1
  fi

  replacement=$(dialog --menu \
    "Select replacement disk for ${failed_disk}:" \
    15 70 10 \
    "$@" \
    3>&1 1>&2 2>&3) || return 0
  [ -n "$replacement" ] || return 0

  failed_size=$(diskinfo "$failed_disk" 2>/dev/null | awk '{ print $3; exit }')
  replace_size=$(diskinfo "$replacement" 2>/dev/null | awk '{ print $3; exit }')
  case "${failed_size}:${replace_size}" in
    :*|*:|*[!0-9:]*)
      # A dead or removed disk often reports no size. Skip the pre-check;
      # zpool replace still rejects a device that is too small.
      ;;
    *)
      if [ "$replace_size" -lt "$failed_size" ]; then
        dialog --msgbox "\nReplacement disk is smaller than failed disk.\n\nUse a disk of equal or larger size." 10 50 || true
        return 1
      fi
      ;;
  esac

  if ! dialog --yesno \
    "\nConfirm disk replacement:\n\n ${failed_disk} -> ${replacement}\n\nThis will take 4-8 hours.\nPool remains usable during resilver." \
    12 60; then
    return 0
  fi

  log "Replacing $failed_disk with $replacement"
  rc=0
  out=$(zpool replace "$POOL_NAME" "$failed_disk" "$replacement" 2>&1) || rc=$?
  if [ -n "$out" ]; then
    printf '%s\n' "$out" | tee -a "$LOG" || true
  fi

  if [ "$rc" -ne 0 ]; then
    log "zpool replace failed (exit $rc)"
    dialog --msgbox "\nDisk replacement failed.\n\n${out}\n\nCheck logs: $LOG" 12 60 || true
    return 1
  fi

  dialog --msgbox \
    "\nDisk replacement started.\n\nMonitor progress:\n zpool status $POOL_NAME\n\nPool remains usable.\nDo not power off until resilver completes." \
    12 60 || true
  return 0
}
# menu_pool_scrub
# Scrub sub-menu for $POOL_NAME: start a scrub, show scrub status, or stop a
# running scrub.
#
# Returns 1 if the pool cannot be imported or a scrub could not be started,
# 0 otherwise (including cancel and a declined confirmation).
#
# bsddialog reports --yesno through its exit status, so the confirmation is
# tested with `if dialog --yesno`; capturing stdout never yields "yes".
menu_pool_scrub() {
  local status last_scrub action scrub_status out rc

  log "Pool scrub selected"
  import_pool_rw || return 1

  status=$(zpool status "$POOL_NAME" 2>/dev/null) || true
  last_scrub=$(printf '%s\n' "$status" | grep -A1 "scan:" | grep "scrub" | head -1) || true

  action=$(dialog --menu \
    "Pool Scrub\n\nLast scrub:\n${last_scrub:-None}\n\nScrub reads all data and verifies checksums.\nRepairs silent corruption if redundancy exists." \
    15 60 3 \
    "start" "Start scrub now" \
    "status" "Check scrub status" \
    "stop" "Stop running scrub" \
    "back" "Back to main menu" \
    3>&1 1>&2 2>&3) || return 0

  case "$action" in
    start)
      # Capture zpool's own status; through `| tee` it would be lost and a
      # failed start would be announced as "Scrub started".
      rc=0
      out=$(zpool scrub "$POOL_NAME" 2>&1) || rc=$?
      if [ -n "$out" ]; then
        printf '%s\n' "$out" | tee -a "$LOG" || true
      fi
      if [ "$rc" -ne 0 ]; then
        dialog --msgbox "\nFailed to start scrub.\n\n${out}\n\nCheck logs: $LOG" 12 60 || true
        return 1
      fi
      dialog --msgbox "\nScrub started.\n\nPool remains usable during scrub.\nMonitor: zpool status $POOL_NAME" 10 50 || true
      ;;
    status)
      scrub_status=$(zpool status "$POOL_NAME" 2>/dev/null | grep -A5 "scan:") || true
      dialog --msgbox "${scrub_status}" 15 60 || true
      ;;
    stop)
      if dialog --yesno "\nStop running scrub?\n\nThis is safe but not recommended." 10 50; then
        zpool scrub -s "$POOL_NAME" 2>&1 | tee -a "$LOG" || true
      fi
      ;;
  esac
  return 0
}
# menu_disk_cleanup
# Disk-space sub-menu for $POOL_NAME: quick cache/log cleanup, deletion of
# snapshots older than 30 days, boot-environment overview, large-file search
# and per-dataset usage.
#
# Returns 1 if the pool cannot be imported, 0 otherwise.
#
# Safety notes:
#   * bsddialog reports --yesno through its exit status; confirmations are
#     tested with `if dialog --yesno`.
#   * A snapshot whose creation time cannot be read is skipped. Treating an
#     empty value as 0 would make it look decades old and destroy it.
#   * Usage figures fall back to 0 when `zfs list` fails, and the percentage
#     is only computed for a non-zero total.
menu_disk_cleanup() {
  local used avail total used_gb total_gb percent action
  local snapshots snapshot_count snap creation now age_days out
  local bes active large_files datasets

  log "Disk space cleanup selected"
  import_pool_rw || return 1

  used=$(zfs list -Hp -o used "$POOL_NAME" 2>/dev/null) || true
  avail=$(zfs list -Hp -o avail "$POOL_NAME" 2>/dev/null) || true
  case "$used" in ''|*[!0-9]*) used=0 ;; esac
  case "$avail" in ''|*[!0-9]*) avail=0 ;; esac
  total=$((used + avail))
  used_gb=$((used / 1073741824))
  total_gb=$((total / 1073741824))
  if [ "$total" -gt 0 ]; then
    percent=$((used * 100 / total))
  else
    percent=0
  fi

  action=$(dialog --menu \
    "Disk Space Cleanup\n\nPool: $POOL_NAME\nUsed: ${used_gb} GB / ${total_gb} GB (${percent}%)\n\nSelect cleanup type:" \
    18 70 6 \
    "quick" "Quick cleanup (pkg cache, npm cache, old logs)" \
    "snapshots" "Snapshot cleanup (delete old snapshots)" \
    "be" "Boot environment cleanup (delete old BEs)" \
    "large" "Find large files (>100MB)" \
    "analyze" "Analyze disk usage by dataset" \
    "back" "Back to main menu" \
    3>&1 1>&2 2>&3) || return 0

  case "$action" in
    quick)
      if dialog --yesno \
        "\nQuick cleanup will remove:\n\n - Package cache (/var/cache/pkg)\n - npm cache (~/.npm)\n - Old logs (>7 days)\n - Temporary files\n - Crash dumps\n\nContinue?" \
        12 60; then
        log "Running quick cleanup"
        # Each step is best-effort; one failure must not stop the others.
        rm -rf /var/cache/pkg/* 2>/dev/null || true
        rm -rf /home/*/.npm 2>/dev/null || true
        find /var/log -type f -mtime +7 -delete 2>/dev/null || true
        rm -rf /tmp/* 2>/dev/null || true
        rm -rf /var/tmp/* 2>/dev/null || true
        rm -rf /var/crash/* 2>/dev/null || true
        dialog --msgbox "\nQuick cleanup complete." 8 40 || true
      fi
      ;;
    snapshots)
      snapshots=$(zfs list -t snapshot -r -H -o name "$POOL_NAME" 2>/dev/null) || true
      # Count non-empty lines only, so an empty list is 0 rather than 1.
      snapshot_count=$(printf '%s\n' "$snapshots" | grep -c .) || true
      if dialog --yesno \
        "\nFound ${snapshot_count} snapshots.\n\nDelete snapshots older than 30 days?" \
        10 50; then
        log "Deleting old snapshots"
        now=$(date +%s)
        while IFS= read -r snap; do
          [ -n "$snap" ] || continue
          creation=$(zfs get -Hp -o value creation "$snap" 2>/dev/null) || creation=""
          case "$creation" in
            ''|*[!0-9]*)
              log "Skipping snapshot with unknown creation time: $snap"
              continue
              ;;
          esac
          age_days=$(( (now - creation) / 86400 ))
          if [ "$age_days" -gt 30 ]; then
            if out=$(zfs destroy "$snap" 2>&1); then
              log "Deleted snapshot: $snap (age: ${age_days} days)"
            else
              log "Failed to delete snapshot: $snap: $out"
            fi
          fi
        done <<EOF
$snapshots
EOF
        dialog --msgbox "\nSnapshot cleanup complete." 8 40 || true
      fi
      ;;
    be)
      bes=$(bectl list -H 2>/dev/null | awk '{ print $1 }') || true
      # Column 2 of `bectl list -H` holds the active flags; R marks the BE
      # that is active on next boot (it is usually combined with N as "NR").
      active=$(bectl list -H 2>/dev/null | awk '$2 ~ /R/ { print $1; exit }') || true
      dialog --msgbox \
        "\nBoot Environments\n\nActive: ${active:-unknown}\n\nAvailable:\n${bes}\n\nUse 'bectl destroy <name>' to remove old BEs." \
        15 60 || true
      ;;
    large)
      dialog --infobox "Scanning for large files..." 5 50 || true
      large_files=$(find / -type f -size +100M 2>/dev/null | head -20) || true
      dialog --msgbox \
        "\nLarge files (>100MB):\n\n${large_files:-None found}\n\nReview and delete manually if needed." \
        20 70 || true
      ;;
    analyze)
      # Let zfs order by the real byte count; `sort -n` on the human-readable
      # column would rank 900M above 2G.
      datasets=$(zfs list -r -H -o name,used -S used "$POOL_NAME" 2>/dev/null | head -15) || true
      dialog --msgbox "\nDatasets by size:\n\n${datasets}" 20 70 || true
      ;;
  esac
  return 0
}
# menu_be_manager
# Boot-environment sub-menu: list, create, activate or destroy boot
# environments via bectl. The menu is shown again after each action until the
# user picks "back" or cancels.
#
# Returns 1 if the pool cannot be imported, 0 otherwise.
#
# Implementation notes:
#   * A loop replaces self-recursion, and leaving the menu returns 0; under
#     `set -e` a non-zero return from here would terminate the caller.
#   * BE menus are built in the positional parameters and passed as "$@" so
#     each entry is exactly one tag plus one (empty) description.
#   * bsddialog reports --yesno through its exit status.
#   * The active BE is taken from column 2 of `bectl list -H` (flag R, which
#     is normally combined with N as "NR").
menu_be_manager() {
  local active action name be be_list

  log "Boot environment manager selected"
  import_pool_rw || return 1

  while :; do
    active=$(bectl list -H 2>/dev/null | awk '$2 ~ /R/ { print $1; exit }') || true

    action=$(dialog --menu \
      "Boot Environments\n\nActive: ${active:-unknown}\n\nSelect action:" \
      15 60 4 \
      "list" "List all boot environments" \
      "create" "Create new boot environment" \
      "activate" "Activate a boot environment" \
      "destroy" "Destroy a boot environment" \
      "back" "Back to main menu" \
      3>&1 1>&2 2>&3) || return 0

    case "$action" in
      list)
        dialog --msgbox "$(bectl list 2>/dev/null)" 15 70 || true
        ;;
      create)
        name=$(dialog --inputbox "New boot environment name:" 8 50 "" 3>&1 1>&2 2>&3) || name=""
        if [ -n "$name" ]; then
          bectl create "$name" 2>&1 | tee -a "$LOG" || true
        fi
        ;;
      activate)
        be_list=$(bectl list -H 2>/dev/null | awk '{ print $1 }') || true
        set --
        while IFS= read -r be; do
          [ -n "$be" ] || continue
          set -- "$@" "$be" ""
        done <<EOF
$be_list
EOF
        name=""
        if [ $# -gt 0 ]; then
          name=$(dialog --menu "Select BE to activate:" 15 60 10 "$@" 3>&1 1>&2 2>&3) || name=""
        fi
        if [ -n "$name" ]; then
          bectl activate "$name" 2>&1 | tee -a "$LOG" || true
        fi
        ;;
      destroy)
        be_list=$(bectl list -H 2>/dev/null | awk '{ print $1 }') || true
        set --
        while IFS= read -r be; do
          [ -n "$be" ] || continue
          # Never offer the active boot environment for destruction.
          [ "$be" != "$active" ] || continue
          set -- "$@" "$be" ""
        done <<EOF
$be_list
EOF
        name=""
        if [ $# -gt 0 ]; then
          name=$(dialog --menu "Select BE to destroy:" 15 60 10 "$@" 3>&1 1>&2 2>&3) || name=""
        fi
        if [ -n "$name" ] && [ "$name" != "$active" ]; then
          if dialog --yesno "\nDestroy boot environment '$name'?\n\nThis cannot be undone." 10 50; then
            bectl destroy "$name" 2>&1 | tee -a "$LOG" || true
          fi
        fi
        ;;
      *)
        # "back" or an empty selection leaves the sub-menu.
        return 0
        ;;
    esac
  done
}
# menu_pool_migrate
# Hand control over to the pool-migration tool shipped under
# ${SHARE}/firstboot. On success this function does not return: the current
# shell is replaced by the migration script.
#
# If the script is missing or not executable, an error box is shown and 1 is
# returned instead; a failed `exec` would otherwise terminate the whole
# maintenance shell.
menu_pool_migrate() {
  local script="${SHARE}/firstboot/zfs-pool-migrate.sh"

  log "Pool migration selected"

  if [ ! -x "$script" ]; then
    log "Pool migration script missing or not executable: $script"
    dialog --msgbox "\nPool migration tool not found:\n\n$script" 10 60 || true
    return 1
  fi

  exec "$script"
}
# menu_import_ro
# Import $POOL_NAME read-only for data recovery, explain where the data is,
# then drop the user into an interactive shell. Control returns to the
# caller when that shell exits.
#
# Returns 1 (without showing the success box or starting a shell) when the
# read-only import fails, 0 otherwise.
menu_import_ro() {
  log "Import read-only selected"
  import_pool_ro || return 1

  dialog --msgbox \
    "\nPool '$POOL_NAME' imported read-only.\n\nData is accessible at:\n /$POOL_NAME/\n\nUseful for data recovery.\n\nExport when done:\n zpool export $POOL_NAME" \
    15 60 || true

  # The exit status of the recovery shell is whatever the user's last
  # command returned; it must not abort the tool under `set -e`.
  /bin/sh || true
  return 0
}
# main_menu
# Top-level menu loop of maintenance mode. The entries depend on whether
# $POOL_NAME is currently imported:
#
#   imported:      1 health, 2 replace disk, 3 scrub, 4 cleanup,
#                  5 boot environments, 6 migration, 7 import read-only,
#                  8 export pool and reboot
#   not imported:  1 import read-write, 2 import read-only, 3 shell, 4 reboot
#
# The menu is shown again after every action and after cancel; the function
# only ends through "Shell" (exec), pool migration (exec) or reboot.
#
# Implementation notes:
#   * Entries are placed in the positional parameters and passed as "$@".
#     Expanding a string with embedded quotes unquoted yields tags such as
#     '"1"' (quotes included), which no case arm matches.
#   * Pool state is re-read on every pass, so an import or export performed
#     by an action is reflected in the next menu.
#   * Handlers are invoked with `|| true`: a failed action must bring the
#     user back to the menu, not abort the script under `set -e`.
main_menu() {
  local pool_detected pool_status pool_label choice

  while :; do
    pool_detected=""
    pool_status=""
    if zpool list "$POOL_NAME" >/dev/null 2>&1; then
      pool_detected="yes"
      pool_status=$(zpool status "$POOL_NAME" 2>/dev/null |
        awk -F: '/state:/ { sub(/^[ \t]+/, "", $2); print $2; exit }') || true
    fi

    if [ "$pool_detected" = "yes" ]; then
      pool_label="$POOL_NAME"
      set -- \
        "1" "Pool Health Check — Status: ${pool_status}" \
        "2" "Replace Failed Disk" \
        "3" "Scrub Pool" \
        "4" "Disk Space Cleanup" \
        "5" "Boot Environments" \
        "6" "Pool Migration" \
        "7" "Import Read-Only" \
        "8" "Export Pool & Reboot"
    else
      pool_label="$POOL_NAME (not imported)"
      set -- \
        "1" "Import Pool (read-write)" \
        "2" "Import Pool (read-only)" \
        "3" "Shell" \
        "4" "Reboot"
    fi

    # Cancel/ESC yields an empty choice and simply redisplays the menu.
    choice=$(dialog --menu \
      "Clawdie Maintenance Mode\n\nPool: ${pool_label}\n" \
      18 70 8 \
      "$@" \
      3>&1 1>&2 2>&3) || choice=""

    case "${pool_detected}:${choice}" in
      yes:1) menu_pool_health || true ;;
      yes:2) menu_replace_disk || true ;;
      yes:3) menu_pool_scrub || true ;;
      yes:4) menu_disk_cleanup || true ;;
      yes:5) menu_be_manager || true ;;
      yes:6) menu_pool_migrate || true ;;
      yes:7) menu_import_ro || true ;;
      yes:8)
        export_pool || true
        reboot || true
        ;;
      :1) import_pool_rw || true ;;
      :2) import_pool_ro || true ;;
      :3) exec /bin/sh ;;
      :4) reboot || true ;;
    esac
  done
}
# main
# Entry point: record the start in the log, make sure the ZFS kernel module
# is loaded (best-effort), show the introduction and enter the main menu.
# Command-line arguments are accepted but not used.
main() {
  log "Maintenance mode started"

  # Already-loaded or built-in ZFS makes kldload fail; that is fine.
  kldload zfs 2>/dev/null || true

  # Dismissing the box with ESC yields a non-zero status, which must not
  # abort the script under `set -e`.
  dialog --msgbox \
    "\nClawdie Maintenance Mode\n\nThis mode provides tools for:\n - Pool health monitoring\n - Disk replacement\n - Data scrubbing\n - Space cleanup\n - Pool migration\n\nNo changes will be made without confirmation." \
    15 60 || true

  main_menu
}
# Script entry: run main with the original command-line arguments.
main "$@"