Update 2025-04-13_15:16:39
This commit is contained in:
43
wrap_embeddings.sh
Executable file
43
wrap_embeddings.sh
Executable file
@ -0,0 +1,43 @@
|
||||
#!/bin/bash
# Script Version: 0.3
# Description: Convert each .txt in content/ to .json with embedding in json/
#
# For every "<name>.txt" directly inside CONTENT_DIR, writes "<name>.json" into
# JSON_DIR containing the keys "id" (file name without ".txt"), "text" (the
# file's contents) and "embedding" (the vector returned by the model's
# encode() call, as a list).
#
# Exit status: 0 when every file was converted; 1 when the Python dependency
# is missing, no input files exist, or at least one conversion failed.

set -euo pipefail

# Set variables
readonly CONTENT_DIR="./content"
readonly JSON_DIR="./json"
readonly EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"

# Check dependencies
if ! python3 -c "import sentence_transformers" 2>/dev/null; then
  echo "[ERROR] ❌ sentence-transformers not installed. Run: pip3 install sentence-transformers"
  exit 1
fi

# Check input files
# Expand the glob into an array instead of parsing `ls`; with nullglob set, a
# missing directory or a directory without matches yields an empty array.
shopt -s nullglob
txt_files=("$CONTENT_DIR"/*.txt)
shopt -u nullglob
if [[ ! -d "$CONTENT_DIR" ]] || (( ${#txt_files[@]} == 0 )); then
  echo "[ERROR] ❌ No .txt files found in $CONTENT_DIR"
  exit 1
fi

# Create the output directory only once we know there is something to write.
mkdir -p "$JSON_DIR"

# Generate embeddings
# The Python program is fed through a quoted here-document so the shell never
# expands anything inside it; directories and model name travel as arguments.
# stderr is merged into stdout so the Python diagnostics stay on the same
# stream they were emitted on before. The exit status is checked directly:
# piping the output through a relay loop would hide it.
if ! python3 - "$CONTENT_DIR" "$JSON_DIR" "$EMBEDDING_MODEL" 2>&1 <<'PY'
import sys, json, os
from sentence_transformers import SentenceTransformer

content_dir, json_dir, model_name = sys.argv[1], sys.argv[2], sys.argv[3]
model = SentenceTransformer(model_name)

failed = 0
# sorted() only makes the processing/log order deterministic.
for txt_file in sorted(os.listdir(content_dir)):
    if not txt_file.endswith('.txt'):
        continue
    base_name = txt_file[:-4]
    try:
        with open(os.path.join(content_dir, txt_file), 'r', encoding='utf-8') as f:
            text = f.read()
        embedding = model.encode([text])[0].tolist()
        with open(os.path.join(json_dir, f'{base_name}.json'), 'w', encoding='utf-8') as f:
            json.dump({'id': base_name, 'text': text, 'embedding': embedding}, f)
        print(f'[DEBUG] ✅ Saved: {json_dir}/{base_name}.json')
    except Exception as e:
        # Keep going so one bad file does not block the rest, but remember
        # the failure so the caller gets a non-zero exit status.
        failed += 1
        print(f'[ERROR] ❌ Failed: {txt_file} - {str(e)}', file=sys.stderr)

sys.exit(1 if failed else 0)
PY
then
  echo "[ERROR] ❌ Embedding generation failed for one or more files" >&2
  exit 1
fi

echo "✅ All .txt files converted to JSON with embeddings in $JSON_DIR"
|
Reference in New Issue
Block a user