Four files in scripts/memory/:
- common.sh: shared config, DB connection, dependency checks
- embed.sh: generate 1536d embeddings via OpenRouter API
- chunk.sh: split text on sentence boundaries (~500 chars)
- memory-pg.sh: full CLI for store/search/recent/important/count

Store pipeline: text → chunk → embed → insert (fully automated).
Search: hybrid RRF combining full-text and vector similarity.
All scripts use #!/usr/bin/env bash for FreeBSD compatibility.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
71 lines
1.7 KiB
Bash
Executable file
71 lines
1.7 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
# chunk.sh — Split text into chunks for embedding
|
|
#
|
|
# Usage:
|
|
# ./chunk.sh "long text to split"
|
|
# echo "long text" | ./chunk.sh -
|
|
# ./chunk.sh - < file.txt
|
|
#
|
|
# Output: One chunk per line (newline-separated)
|
|
# Splits on sentence boundaries, max ~500 chars per chunk.
|
|
|
|
. "$(dirname "$0")/common.sh"
|
|
|
|
# Pick the text to chunk from the first argument:
#   "-"            -> slurp all of stdin
#   missing/empty  -> print usage to stderr and exit 1
#   anything else  -> the argument itself is the text
case "${1:-}" in
  -)
    INPUT_TEXT=$(cat)
    ;;
  "")
    echo "Usage: chunk.sh <text> | chunk.sh -" >&2
    exit 1
    ;;
  *)
    INPUT_TEXT="$1"
    ;;
esac
|
|
|
|
#######################################
# Split text into chunks, printing one chunk per output line.
#
# Whitespace runs (including newlines) are collapsed to single spaces first,
# so a chunk can never contain a line break. Sentences (split after ".", "!"
# or "?") are packed greedily into chunks of at most MAX_CHARS characters;
# a single sentence longer than MAX_CHARS is hard-split, preferring the last
# space before the limit.
#
# The text is streamed to Python over stdin rather than passed as a command
# line argument, so large inputs are not subject to the OS argument-size
# limit.
#
# Arguments:
#   $1 - text to split
#   $2 - maximum characters per chunk (positive integer, default 500)
# Outputs:
#   chunks on stdout, one per line; nothing for empty/whitespace-only text
# Returns:
#   0 on success, non-zero if $2 is not a positive integer or python3 fails
#######################################
_chunk_text() {
  local text=${1-}
  local max_chars=${2:-500}

  # NOTE: the Python program is inside a single-quoted shell string, so it
  # must not contain any single-quote characters.
  printf '%s' "$text" | python3 -c '
import re
import sys

# Drain stdin before validating so the writer never sees a closed pipe.
raw = sys.stdin.read()

try:
    max_chars = int(sys.argv[1])
except ValueError:
    sys.exit("chunk.sh: max chunk size must be an integer, got " + repr(sys.argv[1]))
if max_chars < 1:
    # A non-positive limit would make the hard-split loop below spin forever.
    sys.exit("chunk.sh: max chunk size must be positive, got " + str(max_chars))

# Collapse all whitespace so no chunk can span several output lines.
text = " ".join(raw.split())
if not text:
    sys.exit(0)

# Split after sentence-ending punctuation.
sentences = re.split(r"(?<=[.!?])\s+", text)

# Greedily pack whole sentences into chunks no longer than max_chars.
chunks = []
current = ""
for sentence in sentences:
    if current and len(current) + 1 + len(sentence) > max_chars:
        chunks.append(current)
        current = sentence
    elif current:
        current = current + " " + sentence
    else:
        current = sentence
if current:
    chunks.append(current)

# Hard-split anything still over the limit (a sentence longer than max_chars).
for chunk in chunks:
    while len(chunk) > max_chars:
        # Prefer breaking at the last space before the limit.
        split_at = chunk.rfind(" ", 0, max_chars)
        if split_at == -1:
            split_at = max_chars
        print(chunk[:split_at].strip())
        chunk = chunk[split_at:].strip()
    if chunk:
        print(chunk)
' "$max_chars"
}

_chunk_text "$INPUT_TEXT" "${CHUNK_MAX_CHARS:-500}"
|