Update 2025-04-13_16:05:19
generate_embeddings.py (Normal file, 40 lines added)
@@ -0,0 +1,40 @@
#!/usr/bin/env python3
# Script Version: 01
# Description: Generate embeddings from text files using GPU (if available)

import os
import torch
import json
from sentence_transformers import SentenceTransformer

# Set variables
# ========
CONTENT_DIR = "content"
OUTPUT_FILE = "embeddings.json"

# Initialize the embedding model and move to GPU if available
model = SentenceTransformer("all-mpnet-base-v2")
if torch.cuda.is_available():
    model = model.to("cuda")
    print("[INFO] GPU detected: Model running on GPU")
else:
    print("[INFO] No GPU detected: Model running on CPU")

# Generate embeddings
# ========
embedding_data = {}

for filename in os.listdir(CONTENT_DIR):
    if filename.endswith(".txt"):
        filepath = os.path.join(CONTENT_DIR, filename)
        with open(filepath, "r", encoding="utf-8") as file:
            text = file.read().strip()
            embedding = model.encode(text)
            embedding_data[filename] = embedding.tolist()

# Save embeddings to JSON
with open(OUTPUT_FILE, "w") as json_file:
    json.dump(embedding_data, json_file, indent=4)

print(f"[INFO] Embeddings successfully saved to {OUTPUT_FILE}")
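For reference, embeddings.json maps each .txt filename to its embedding stored as a plain list of floats, so it can be reloaded without sentence-transformers or a GPU. Below is a minimal sketch (not part of this commit) that reads the file back and compares two entries with cosine similarity; the filenames doc_a.txt and doc_b.txt are hypothetical placeholders and numpy is an assumed extra dependency.

# Minimal sketch, assuming embeddings.json was produced by the script above.
# "doc_a.txt" / "doc_b.txt" are hypothetical filenames; numpy is assumed.
import json
import numpy as np

with open("embeddings.json", "r") as f:
    data = json.load(f)

a = np.array(data["doc_a.txt"])
b = np.array(data["doc_b.txt"])

# Cosine similarity between the two stored embeddings
similarity = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"[INFO] Cosine similarity: {similarity:.4f}")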