clawdie-ai/scripts/memory/chunk.sh

#!/usr/bin/env bash
# chunk.sh — Split text into chunks for embedding
#
# Usage:
#   ./chunk.sh "long text to split"
#   echo "long text" | ./chunk.sh -
#   ./chunk.sh - < file.txt
#
# Output: One chunk per line (newline-separated)
# Splits on sentence boundaries; each chunk is capped at $CHUNK_MAX_CHARS
# characters (~500).

. "$(dirname "$0")/common.sh"

# Resolve the text to chunk from the first positional argument:
#   "-"         -> slurp everything from standard input
#   non-empty   -> use the argument itself as the text
#   missing/"" -> print the usage line on stderr and exit with status 1
case "${1:-}" in
    -)
        INPUT_TEXT="$(cat)"
        ;;
    "")
        printf '%s\n' "Usage: chunk.sh <text> | chunk.sh -" >&2
        exit 1
        ;;
    *)
        INPUT_TEXT="$1"
        ;;
esac

#######################################
# Split text into chunks of at most MAX_CHARS characters.
#
# The text is read from stdin rather than passed in argv, so its size is not
# bounded by the kernel's argument-length limit.
#
# Arguments: $1 - maximum characters per chunk (integer >= 1)
# Inputs:    text to split, on stdin
# Outputs:   one chunk per line on stdout; nothing for blank input
# Returns:   0 on success, non-zero if $1 is not a positive integer
#######################################
chunk_text() {
    # The Python source is single-quoted so the shell never expands anything
    # inside it; keep the embedded code free of single quotes.
    python3 -c '
import re
import sys

# Drain stdin before validating so the writing side never hits a closed pipe.
text = sys.stdin.read()

try:
    max_chars = int(sys.argv[1])
except ValueError:
    sys.exit("chunk.sh: max chars must be an integer, got %r" % sys.argv[1])
if max_chars < 1:
    # A non-positive limit would make the hard-split loop below spin forever.
    sys.exit("chunk.sh: max chars must be >= 1, got %d" % max_chars)

# Collapse every whitespace run (newlines included) to one space so that a
# chunk can never span several output lines.
text = " ".join(text.split())
if not text:
    sys.exit(0)

# Greedily pack whole sentences into chunks of at most max_chars.
chunks = []
current = ""
for sentence in re.split(r"(?<=[.!?])\s+", text):
    if current and len(current) + 1 + len(sentence) > max_chars:
        chunks.append(current)
        current = sentence
    elif current:
        current = current + " " + sentence
    else:
        current = sentence
if current:
    chunks.append(current)

# A single sentence may still exceed the limit: cut it at the last space
# before the limit, or mid-word when there is no space to cut at.
for chunk in chunks:
    while len(chunk) > max_chars:
        cut = chunk.rfind(" ", 0, max_chars)
        if cut == -1:
            cut = max_chars
        print(chunk[:cut].rstrip())
        chunk = chunk[cut:].lstrip()
    if chunk:
        print(chunk)
' "$1"
}

# Falls back to 500 (the size the file header documents) when CHUNK_MAX_CHARS
# is unset or empty.
printf '%s' "$INPUT_TEXT" | chunk_text "${CHUNK_MAX_CHARS:-500}"