Update 2025-04-13_16:05:19
This commit is contained in:
79
semantic_search.py
Normal file
79
semantic_search.py
Normal file
@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env python3
|
||||
# Script Version: 0.4
|
||||
# Description: Semantic search over local embeddings.json with content preview and optional file copy
|
||||
|
||||
import json
|
||||
import torch
|
||||
import os
|
||||
import shutil
|
||||
import numpy as np
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
# Config
EMBEDDING_FILE = "embeddings.json"
CONTENT_DIR = "content"
RESULTS_DIR = "results"
MODEL_NAME = "all-mpnet-base-v2"
PREVIEW_LINES = 5  # Number of lines to preview from the matching .txt files
TOP_K = 3  # Number of best matches shown to the user


def rank_matches(query_embedding, stored_embeddings, top_k=None):
    """Rank stored embeddings by cosine similarity to the query.

    Args:
        query_embedding: 1-D sequence/array of floats for the query.
        stored_embeddings: Mapping of filename -> embedding (sequence of
            floats with the same length as ``query_embedding``).
        top_k: Maximum number of results to return; ``None`` returns all.

    Returns:
        List of ``(filename, score)`` tuples, best match first. Ties keep the
        mapping's iteration order. A zero-length vector scores 0.0 instead of
        producing NaN.

    Raises:
        ValueError: If the embeddings do not share the query's dimension.
    """
    if not stored_embeddings:
        return []

    names = list(stored_embeddings)
    matrix = np.asarray([stored_embeddings[name] for name in names], dtype=np.float64)
    query = np.asarray(query_embedding, dtype=np.float64).ravel()

    # One matrix/vector product instead of one similarity call per file.
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    norms[norms == 0.0] = 1.0  # dot product is already 0 there; avoid 0/0
    scores = (matrix @ query) / norms

    ranked = sorted(zip(names, scores.tolist()), key=lambda item: item[1], reverse=True)
    return ranked[:top_k]


def read_preview(path, max_lines):
    """Return up to ``max_lines`` whitespace-stripped lines from a text file.

    Undecodable bytes are replaced rather than raising, so a single badly
    encoded file cannot abort the whole search session.
    """
    lines = []
    if max_lines <= 0:
        return lines
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            lines.append(line.strip())
            if len(lines) >= max_lines:
                break
    return lines


def copy_to_results(src_path, fname, results_dir=RESULTS_DIR):
    """Copy ``src_path`` to ``results_dir/fname`` and return the destination.

    Parent directories of the destination are created on demand because
    ``fname`` may contain sub-directories.
    """
    dest_path = os.path.join(results_dir, fname)
    os.makedirs(os.path.dirname(dest_path) or ".", exist_ok=True)
    shutil.copyfile(src_path, dest_path)
    return dest_path


def load_embeddings(path):
    """Load the filename -> embedding mapping from a JSON file.

    Exits with a readable message (instead of a traceback) when the file is
    missing, is not valid JSON, or does not contain a JSON object.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        raise SystemExit(f"[ERROR] Embedding file not found: {path}") from None
    except json.JSONDecodeError as err:
        raise SystemExit(f"[ERROR] Invalid JSON in {path}: {err}") from None
    if not isinstance(data, dict):
        raise SystemExit(f"[ERROR] {path} must contain a JSON object of filename -> embedding")
    return data


def load_model():
    """Load the sentence-transformer model, moving it to the GPU if present."""
    model = SentenceTransformer(MODEL_NAME)
    if torch.cuda.is_available():
        model = model.to("cuda")
        print("[INFO] Running on GPU")
    return model


def main():
    """Interactive semantic search: prompt, rank, preview and optionally copy."""
    # Ensure results directory exists
    os.makedirs(RESULTS_DIR, exist_ok=True)

    # Load the (cheap) embeddings before the (expensive) model to fail fast.
    stored_embeddings = load_embeddings(EMBEDDING_FILE)
    model = load_model()

    # Prompt user
    query = input("\U0001F50D Enter your search query: ").strip()
    if not query:
        print("[WARN] Empty query; nothing to search.")
        return

    # Embed query and rank stored documents against it
    query_embedding = model.encode(query)
    top_matches = rank_matches(query_embedding, stored_embeddings, TOP_K)

    copied_files = []

    print("\n\U0001F4C2 Top matches:")
    for fname, score in top_matches:
        print(f"\n{fname} → score: {score:.4f}")
        txt_path = os.path.join(CONTENT_DIR, fname)

        # isfile (not exists): a directory with that name cannot be previewed.
        if not os.path.isfile(txt_path):
            print("[WARN] Source file not found for preview.")
            continue

        print("Preview:")
        for line in read_preview(txt_path, PREVIEW_LINES):
            print(" " + line)

        # Ask user if they want to copy the file
        should_copy = input(f"📄 Copy '{fname}' to '{RESULTS_DIR}'? [y/N]: ").strip().lower()
        if should_copy in ("y", "yes"):
            try:
                dest_path = copy_to_results(txt_path, fname)
            except OSError as err:
                # Keep going with the remaining matches if one copy fails.
                print(f"[ERROR] Could not copy '{fname}': {err}")
                continue
            copied_files.append(fname)
            print(f"[INFO] File copied to {dest_path}")

    # Final summary
    if copied_files:
        print("\n✅ Summary of copied files:")
        for copied in copied_files:
            print(f" - {copied}")
    else:
        print("\nℹ️ No files were copied.")


if __name__ == "__main__":
    main()
|
||||
|
Reference in New Issue
Block a user