# at1-workstation-scripts/generate_embeddings.py

#!/usr/bin/env python3
# Script Version: 01
# Description: Generate embeddings from text files using GPU (if available)

import os
import torch
import json
from sentence_transformers import SentenceTransformer

# Configuration
# ========
# Both paths are relative, so they resolve against the directory the script
# is launched from.
# JSON file that receives the filename -> embedding mapping.
OUTPUT_FILE: str = "embeddings.json"
# Directory that is scanned for .txt input files.
CONTENT_DIR: str = "content"

# Initialize the embedding model on the GPU when CUDA is available
# ========
# The target device is chosen once, up front, and handed to the constructor
# so the model is created directly on it. This keeps the log line below in
# step with the device the model was actually asked to use, instead of
# relying on the library's automatic device selection followed by a move.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("all-mpnet-base-v2", device=device)
if device == "cuda":
    print("[INFO] GPU detected: Model running on GPU")
else:
    print("[INFO] No GPU detected: Model running on CPU")

# Generate embeddings
# ========
# Maps each .txt filename found in CONTENT_DIR to its embedding, converted
# to a plain list via .tolist() so it can be serialized as JSON.
embedding_data = {}

# os.listdir() returns entries in arbitrary order; sorting keeps the key
# order of the output stable from one run to the next.
for filename in sorted(os.listdir(CONTENT_DIR)):
    filepath = os.path.join(CONTENT_DIR, filename)
    # Only regular files ending in ".txt" are embedded; this also skips a
    # directory whose name happens to end in ".txt", which open() would
    # otherwise fail on.
    if not (filename.endswith(".txt") and os.path.isfile(filepath)):
        continue
    with open(filepath, "r", encoding="utf-8") as file:
        text = file.read().strip()
    # Encode after the file has been closed so the handle is not held open
    # for the duration of the model call.
    embedding = model.encode(text)
    embedding_data[filename] = embedding.tolist()

# Write the collected embeddings to OUTPUT_FILE as 4-space-indented JSON
with open(OUTPUT_FILE, "w") as out_handle:
    out_handle.write(json.dumps(embedding_data, indent=4))

print(f"[INFO] Embeddings successfully saved to {OUTPUT_FILE}")