#!/usr/bin/env python3
# Script Version: 0.4
# Description: Semantic search over local embeddings.json with content preview
# and optional file copy
"""Interactive semantic search over a local embeddings file.

Flow: load the sentence-transformers model, read the stored embeddings from
``EMBEDDING_FILE``, prompt for a query, rank the stored entries by cosine
similarity to the query, preview the best matches found under
``CONTENT_DIR`` and offer to copy each of them into ``RESULTS_DIR``.
"""

import json
import os
import shutil
from itertools import islice
from operator import itemgetter

import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Config
EMBEDDING_FILE = "embeddings.json"
CONTENT_DIR = "content"
RESULTS_DIR = "results"
MODEL_NAME = "all-mpnet-base-v2"
PREVIEW_LINES = 5  # Number of lines to preview from the matching .txt files
TOP_K = 3  # Number of best matches to display


def load_model():
    """Load ``MODEL_NAME`` and move it to the GPU when CUDA is available."""
    model = SentenceTransformer(MODEL_NAME)
    if torch.cuda.is_available():
        model = model.to("cuda")
        print("[INFO] Running on GPU")
    return model


def load_embeddings(path):
    """Return the parsed JSON content of *path*.

    The rest of the script iterates the result as a mapping of
    filename -> embedding vector.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def rank_matches(query_embedding, stored_embeddings):
    """Return ``(filename, score)`` pairs sorted by descending similarity.

    All stored vectors are scored against the query in a single
    ``cosine_similarity`` call instead of one call per document.
    """
    if not stored_embeddings:
        # cosine_similarity rejects an empty matrix; nothing to rank anyway.
        return []

    filenames = list(stored_embeddings)
    matrix = np.asarray(
        [stored_embeddings[name] for name in filenames], dtype=float
    )
    query = np.asarray(query_embedding, dtype=float).reshape(1, -1)
    scores = cosine_similarity(query, matrix)[0]

    # sorted() is stable, so equal scores keep their order from the file.
    return sorted(zip(filenames, scores), key=itemgetter(1), reverse=True)


def print_preview(txt_path):
    """Print up to ``PREVIEW_LINES`` stripped lines from *txt_path*."""
    print("Preview:")
    # errors="replace": a stray non-UTF-8 byte must not abort the session.
    with open(txt_path, "r", encoding="utf-8", errors="replace") as f:
        for line in islice(f, PREVIEW_LINES):
            print(" " + line.strip())


def copy_to_results(fname, txt_path):
    """Copy *txt_path* to ``RESULTS_DIR/fname`` and return the destination."""
    dest_path = os.path.join(RESULTS_DIR, fname)
    dest_dir = os.path.dirname(dest_path)
    if dest_dir:
        # fname may carry a sub-directory that does not exist in results yet.
        os.makedirs(dest_dir, exist_ok=True)
    shutil.copyfile(txt_path, dest_path)
    return dest_path


def main():
    """Run one interactive search session."""
    # Ensure results directory exists
    os.makedirs(RESULTS_DIR, exist_ok=True)

    model = load_model()
    stored_embeddings = load_embeddings(EMBEDDING_FILE)

    # Prompt user
    query = input("\U0001F50D Enter your search query: ").strip()
    if not query:
        # Ranking against an empty string would only produce noise.
        print("[WARN] Empty query; nothing to search.")
        return

    # Embed query and rank every stored document against it
    query_embedding = model.encode(query)
    results = rank_matches(query_embedding, stored_embeddings)

    copied_files = []
    print("\n\U0001F4C2 Top matches:")
    for fname, score in results[:TOP_K]:
        print(f"\n{fname} → score: {score:.4f}")
        txt_path = os.path.join(CONTENT_DIR, fname)

        if not os.path.exists(txt_path):
            print("[WARN] Source file not found for preview.")
            continue

        print_preview(txt_path)

        # Ask user if they want to copy the file
        should_copy = input(
            f"\U0001F4C4 Copy '{fname}' to '{RESULTS_DIR}'? [y/N]: "
        ).strip().lower()
        if should_copy == "y":
            dest_path = copy_to_results(fname, txt_path)
            copied_files.append(fname)
            print(f"[INFO] File copied to {dest_path}")

    # Final summary
    if copied_files:
        print("\n\u2705 Summary of copied files:")
        for name in copied_files:
            print(f" - {name}")
    else:
        print("\n\u2139\ufe0f No files were copied.")


if __name__ == "__main__":
    main()