#!/usr/bin/env python3
# Script Version: 01
# Description: Generate embeddings from text files using GPU (if available)

import os
import torch
import json
from sentence_transformers import SentenceTransformer

# Set variables
# ========
CONTENT_DIR = "content"
OUTPUT_FILE = "embeddings.json"

# Initialize the embedding model and move it to the GPU if available
model = SentenceTransformer("all-mpnet-base-v2")
if torch.cuda.is_available():
    model = model.to("cuda")
    print("[INFO] GPU detected: Model running on GPU")
else:
    print("[INFO] No GPU detected: Model running on CPU")

# Generate embeddings
# ========
embedding_data = {}

for filename in os.listdir(CONTENT_DIR):
    if filename.endswith(".txt"):
        filepath = os.path.join(CONTENT_DIR, filename)
        with open(filepath, "r", encoding="utf-8") as file:
            text = file.read().strip()
        embedding = model.encode(text)
        embedding_data[filename] = embedding.tolist()

# Save embeddings to JSON
with open(OUTPUT_FILE, "w") as json_file:
    json.dump(embedding_data, json_file, indent=4)

print(f"[INFO] Embeddings successfully saved to {OUTPUT_FILE}")