#!/bin/bash
# Script Version: 0.3
# Description: Convert each .txt in content/ to .json with embedding in json/
#
# Usage:    run from the directory that contains content/; output goes to json/.
# Requires: python3 with the sentence-transformers package importable.
# Exit:     0 when every .txt file was converted; 1 when a dependency or the
#           input is missing, or when at least one file failed to convert.

set -euo pipefail

# Set variables
readonly CONTENT_DIR="./content"
readonly JSON_DIR="./json"
readonly EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"

# Check dependencies
# Report a missing interpreter separately so it is not mistaken for a missing
# Python package.
if ! command -v python3 >/dev/null 2>&1; then
  echo "[ERROR] ❌ python3 not found in PATH" >&2
  exit 1
fi

# stderr is silenced on purpose: only the import's exit status matters here.
if ! python3 -c "import sentence_transformers" 2>/dev/null; then
  echo "[ERROR] ❌ sentence-transformers not installed. Run: pip3 install sentence-transformers" >&2
  exit 1
fi

# Check input files
# With nullglob an unmatched pattern expands to nothing, so an empty array
# covers both "directory missing" and "directory has no .txt files" without
# parsing ls output.
shopt -s nullglob
txt_files=("$CONTENT_DIR"/*.txt)
shopt -u nullglob

if (( ${#txt_files[@]} == 0 )); then
  echo "[ERROR] ❌ No .txt files found in $CONTENT_DIR"  >&2
  exit 1
fi

# Create the output directory only once the input is known to exist.
mkdir -p -- "$JSON_DIR"

# Generate embeddings
# The Python program is fed through a quoted heredoc so the shell expands
# nothing inside it; directories and model name travel as argv values rather
# than being spliced into the source text. Python exits non-zero when any file
# fails, and that status is checked here instead of being lost in a pipeline.
if ! python3 - "$CONTENT_DIR" "$JSON_DIR" "$EMBEDDING_MODEL" <<'PY'
import json
import os
import sys

from sentence_transformers import SentenceTransformer

content_dir, json_dir, model_name = sys.argv[1:4]
model = SentenceTransformer(model_name)

failures = 0
# sorted() makes the processing order (and therefore the log) deterministic.
for txt_file in sorted(os.listdir(content_dir)):
    if not txt_file.endswith('.txt'):
        continue
    base_name = txt_file[:-4]  # strip the '.txt' suffix
    try:
        with open(os.path.join(content_dir, txt_file), 'r', encoding='utf-8') as f:
            text = f.read()
        # encode() is given a one-element list; take the single resulting vector.
        embedding = model.encode([text])[0].tolist()
        with open(os.path.join(json_dir, f'{base_name}.json'), 'w', encoding='utf-8') as f:
            json.dump({'id': base_name, 'text': text, 'embedding': embedding}, f)
        print(f'[DEBUG] ✅ Saved: {json_dir}/{base_name}.json', flush=True)
    except Exception as e:
        # Keep going so one bad file does not block the rest, but remember the
        # failure so the overall exit status reflects it.
        failures += 1
        print(f'[ERROR] ❌ Failed: {txt_file} - {str(e)}', file=sys.stderr)

sys.exit(1 if failures else 0)
PY
then
  echo "[ERROR] ❌ Embedding generation failed for one or more files" >&2
  exit 1
fi

echo "✅ All .txt files converted to JSON with embeddings in $JSON_DIR"