Files
at1-server-scripts/wrap_embeddings.sh
2025-04-13 15:16:39 +02:00

44 lines
1.6 KiB
Bash
Executable File

#!/bin/bash
# Script Version: 0.3
# Description: Convert each .txt in content/ to .json with embedding in json/
# Set variables
# Each setting may be overridden from the environment; when unset or empty
# the original default is used.
#   CONTENT_DIR      directory scanned for input .txt files
#   JSON_DIR         directory that receives the generated .json files
#   EMBEDDING_MODEL  model name handed to SentenceTransformer
CONTENT_DIR="${CONTENT_DIR:-./content}"
JSON_DIR="${JSON_DIR:-./json}"
EMBEDDING_MODEL="${EMBEDDING_MODEL:-sentence-transformers/all-MiniLM-L6-v2}"
# Check dependencies
# Abort early (exit status 1, message on stderr) when the interpreter or the
# required Python package is missing.
if ! command -v python3 >/dev/null 2>&1; then
  # Without this check a missing interpreter would be reported below as a
  # missing package, which sends the user to the wrong fix.
  echo "[ERROR] ❌ python3 not found in PATH" >&2
  exit 1
fi
# The probe's own stderr (the ImportError traceback) is discarded on purpose;
# the message below replaces it.
if ! python3 -c "import sentence_transformers" 2>/dev/null; then
  echo "[ERROR] ❌ sentence-transformers not installed. Run: pip3 install sentence-transformers" >&2
  exit 1
fi
# Check input files
# Require at least one *.txt entry in CONTENT_DIR. The shell's own glob is
# used instead of parsing `ls`: no extra process is started and a very large
# directory cannot overflow an external command's argument list.
has_txt=false
for candidate in "$CONTENT_DIR"/*.txt; do
  # An unmatched glob stays as the literal pattern, so confirm the entry
  # really exists (-L also accepts a dangling symlink, which `ls` would list).
  if [[ -e "$candidate" || -L "$candidate" ]]; then
    has_txt=true
    break
  fi
done
# A missing or unreadable CONTENT_DIR yields no glob match and lands here too.
if [[ "$has_txt" != true ]]; then
  echo "[ERROR] ❌ No .txt files found in $CONTENT_DIR" >&2
  exit 1
fi
# Create the output directory only once the input is known to be usable, and
# stop if it cannot be created instead of failing later for every file.
if ! mkdir -p "$JSON_DIR"; then
  echo "[ERROR] ❌ Cannot create output directory $JSON_DIR" >&2
  exit 1
fi
# Generate embeddings
# For every *.txt file in CONTENT_DIR, write JSON_DIR/<name>.json containing
# the keys 'id', 'text' and 'embedding'.
#
# - The Python program is fed through a quoted here-document, so the shell
#   expands nothing inside it; directories and the model name are passed as
#   arguments rather than spliced into the source.
# - stderr is merged into stdout (as before) so one redirected log captures
#   both the per-file progress lines and the error lines.
# - Python exits non-zero when the model cannot be loaded or any file fails,
#   and the success message is printed only when it exits 0.
if ! python3 - "$CONTENT_DIR" "$JSON_DIR" "$EMBEDDING_MODEL" 2>&1 <<'PY'
import json
import os
import sys

from sentence_transformers import SentenceTransformer

# argv[0] is '-' (script read from stdin); the next three are our arguments.
content_dir, json_dir, model_name = sys.argv[1:4]
model = SentenceTransformer(model_name)

failures = 0
# Sorted so the processing order (and the log) is reproducible between runs.
for txt_file in sorted(os.listdir(content_dir)):
    if not txt_file.endswith('.txt'):
        continue
    base_name = txt_file[:-4]
    try:
        with open(os.path.join(content_dir, txt_file), 'r', encoding='utf-8') as f:
            text = f.read()
        embedding = model.encode([text])[0].tolist()
        with open(os.path.join(json_dir, f'{base_name}.json'), 'w', encoding='utf-8') as f:
            json.dump({'id': base_name, 'text': text, 'embedding': embedding}, f)
        print(f'[DEBUG] ✅ Saved: {json_dir}/{base_name}.json')
    except Exception as e:
        # Keep going so one bad file does not block the rest, but remember
        # the failure so the run as a whole is reported as failed.
        failures += 1
        print(f'[ERROR] ❌ Failed: {txt_file} - {str(e)}', file=sys.stderr)

sys.exit(1 if failures else 0)
PY
then
  echo "[ERROR] ❌ Embedding generation failed for one or more files" >&2
  exit 1
fi
echo "✅ All .txt files converted to JSON with embeddings in $JSON_DIR"