Files
at1-workstation-scripts/generate_embeddings.py
2025-04-13 16:05:19 +02:00

41 lines
1.1 KiB
Python

#!/usr/bin/env python3
# Script Version: 01
# Description: Generate embeddings from text files using GPU (if available)
import os
import torch
import json
from sentence_transformers import SentenceTransformer
# Set variables
# ========
CONTENT_DIR = "content"
OUTPUT_FILE = "embeddings.json"
MODEL_NAME = "all-mpnet-base-v2"


def load_texts(content_dir=CONTENT_DIR):
    """Read every non-empty ``.txt`` file located directly in *content_dir*.

    Args:
        content_dir: Directory to scan. Sub-directories are not descended into.

    Returns:
        A dict mapping each file name to its whitespace-stripped text, in
        sorted file-name order. Files that are empty after stripping are
        skipped with a warning because an embedding of "" carries no meaning.

    Raises:
        OSError: If *content_dir* cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = {}
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # generated JSON reproducible across runs and machines.
    for filename in sorted(os.listdir(content_dir)):
        filepath = os.path.join(content_dir, filename)
        # The isfile() check guards against a directory named "something.txt".
        if not filename.endswith(".txt") or not os.path.isfile(filepath):
            continue
        with open(filepath, "r", encoding="utf-8") as file:
            text = file.read().strip()
        if not text:
            print(f"[WARN] Skipping empty file: {filepath}")
            continue
        texts[filename] = text
    return texts


def load_model(model_name=MODEL_NAME):
    """Load the embedding model, placing it on the GPU when CUDA is available.

    Args:
        model_name: Name of the sentence-transformers model to load.

    Returns:
        The initialised ``SentenceTransformer`` instance.
    """
    use_gpu = torch.cuda.is_available()
    # Passing the device to the constructor loads the weights straight onto
    # the target device instead of moving them afterwards.
    model = SentenceTransformer(model_name, device="cuda" if use_gpu else "cpu")
    if use_gpu:
        print("[INFO] GPU detected: Model running on GPU")
    else:
        print("[INFO] No GPU detected: Model running on CPU")
    return model


def embed_texts(model, texts):
    """Encode all *texts* with *model* in a single batched call.

    Args:
        model: Object exposing ``encode(list_of_str)`` that returns one vector
            per input string, each vector supporting ``.tolist()``.
        texts: Dict mapping file name to text, as returned by ``load_texts``.

    Returns:
        A dict mapping each file name to its embedding as a plain list of
        floats (JSON serialisable). Empty input yields an empty dict without
        touching the model.
    """
    if not texts:
        return {}
    # One batched encode() call lets the library batch the inputs itself,
    # which is considerably faster than one call per file.
    vectors = model.encode(list(texts.values()))
    return {name: vector.tolist() for name, vector in zip(texts, vectors)}


def save_embeddings(embedding_data, output_file=OUTPUT_FILE):
    """Write *embedding_data* to *output_file* as indented JSON.

    Args:
        embedding_data: Dict mapping file name to a list of floats.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(embedding_data, json_file, indent=4)


def main():
    """Generate embeddings for CONTENT_DIR and store them in OUTPUT_FILE."""
    # Read the inputs first so a missing or unreadable content directory
    # fails fast, before the expensive model load.
    try:
        texts = load_texts(CONTENT_DIR)
    except OSError as err:
        raise SystemExit(
            f"[ERROR] Cannot read text files from '{CONTENT_DIR}': {err}"
        ) from err

    if texts:
        embedding_data = embed_texts(load_model(), texts)
    else:
        # Nothing to encode: skip loading the model, still emit a valid file.
        print(f"[WARN] No non-empty .txt files found in {CONTENT_DIR}")
        embedding_data = {}

    # Save embeddings to JSON
    save_embeddings(embedding_data, OUTPUT_FILE)
    print(f"[INFO] Embeddings successfully saved to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()