#!/usr/bin/env bash
# chunk.sh — Split text into chunks for embedding
#
# Usage:
#   ./chunk.sh "long text to split"
#   echo "long text" | ./chunk.sh -
#   ./chunk.sh - < file.txt
#
# Output: One chunk per line (newline-separated).
#   Splits on sentence boundaries, max ~500 chars per chunk.
#   Line breaks inside the input are folded into single spaces so that a
#   chunk never spans more than one output line.
#
# Environment:
#   CHUNK_MAX_CHARS  Maximum characters per chunk. Must be an integer >= 1.
#                    Falls back to 500 when unset or empty.
#
# Exit status:
#   0  success (empty input prints nothing)
#   1  missing argument, or invalid CHUNK_MAX_CHARS

# NOTE(review): CHUNK_MAX_CHARS is presumably defined by common.sh — confirm.
# NOTE(review): strict mode (set -euo pipefail) is not enabled here; confirm
# whether common.sh already sets it before adding it to this script.
. "$(dirname "$0")/common.sh"

# Read input: "-" means stdin, any other non-empty argument is the text itself.
if [[ "${1:-}" == "-" ]]; then
  INPUT_TEXT=$(cat)
elif [[ -n "${1:-}" ]]; then
  INPUT_TEXT="$1"
else
  echo "Usage: chunk.sh <text> | chunk.sh -" >&2
  exit 1
fi

# The text is fed to Python on stdin rather than as an argument, so large
# inputs do not hit the operating system limit on argument size.
# The Python program is single-quoted so the shell never expands anything in
# it; for that reason it must not contain a single-quote character.
python3 -c '
import os
import re
import sys

# Validate the chunk size first: a value below 1 would make the hard-split
# loop at the bottom spin forever.
try:
    max_chars = int(sys.argv[1])
except ValueError:
    sys.exit("chunk.sh: CHUNK_MAX_CHARS must be an integer, got " + repr(sys.argv[1]))
if max_chars < 1:
    sys.exit("chunk.sh: CHUNK_MAX_CHARS must be >= 1, got " + str(max_chars))

# os.fsdecode decodes the bytes the same way Python decodes command-line
# arguments, so undecodable bytes are handled as they were when the text
# arrived through argv.
text = os.fsdecode(sys.stdin.buffer.read()).strip()

if not text:
    sys.exit(0)

# Fold line breaks (and the whitespace around them) into single spaces.
# Output is one chunk per line, so a chunk must not contain a newline.
# Text without line breaks passes through unchanged.
text = " ".join(part for part in (line.strip() for line in text.splitlines()) if part)

# If short enough, single chunk
if len(text) <= max_chars:
    print(text)
    sys.exit(0)

# Split on sentence boundaries: whitespace that follows ".", "!" or "?"
sentences = re.split(r"(?<=[.!?])\s+", text)

chunks = []
current = ""

for sentence in sentences:
    # If adding this sentence (plus a joining space) exceeds the limit,
    # flush the current chunk and start a new one with this sentence.
    if current and len(current) + len(sentence) + 1 > max_chars:
        chunks.append(current.strip())
        current = sentence
    else:
        current = (current + " " + sentence).strip() if current else sentence

# Flush whatever is left after the last sentence
if current.strip():
    chunks.append(current.strip())

# Safety: a single sentence can still exceed the limit, so hard-split it.
final = []
for chunk in chunks:
    while len(chunk) > max_chars:
        # Prefer the last space before the limit; otherwise cut mid-word.
        # chunk is stripped, so it never starts with a space and split_at
        # is always >= 1, which guarantees the loop makes progress.
        split_at = chunk.rfind(" ", 0, max_chars)
        if split_at == -1:
            split_at = max_chars
        final.append(chunk[:split_at].strip())
        chunk = chunk[split_at:].strip()
    if chunk:
        final.append(chunk)

for chunk in final:
    print(chunk)
' "${CHUNK_MAX_CHARS:-500}" <<<"$INPUT_TEXT"