From a10ca4fceffd8e278b0d00f90c36adef16c7b5c4 Mon Sep 17 00:00:00 2001
From: oib
Date: Sun, 13 Apr 2025 16:05:19 +0200
Subject: [PATCH] Update 2025-04-13_16:05:19

---
 generate_embeddings.py |  51 +++++++++
 gitea_push.sh          | 219 +++++++++++++++++++++++++++++++++++++++++
 import_embeddings.py   |  38 +++++++
 semantic_search.py     |  79 +++++++++++++++
 4 files changed, 387 insertions(+)
 create mode 100644 generate_embeddings.py
 create mode 100755 gitea_push.sh
 create mode 100644 import_embeddings.py
 create mode 100644 semantic_search.py

diff --git a/generate_embeddings.py b/generate_embeddings.py
new file mode 100644
index 0000000..ee5923f
--- /dev/null
+++ b/generate_embeddings.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# Script Version: 01
+# Description: Generate embeddings from text files using GPU (if available)
+#
+# Encodes the stripped text of every *.txt file in CONTENT_DIR and writes a
+# {filename: vector} mapping to OUTPUT_FILE as JSON.
+
+import os
+import torch
+import json
+from sentence_transformers import SentenceTransformer
+
+# Set variables
+# ========
+CONTENT_DIR = "content"
+OUTPUT_FILE = "embeddings.json"
+
+# Initialize the embedding model and move to GPU if available
+model = SentenceTransformer("all-mpnet-base-v2")
+if torch.cuda.is_available():
+    model = model.to("cuda")
+    print("[INFO] GPU detected: Model running on GPU")
+else:
+    print("[INFO] No GPU detected: Model running on CPU")
+
+# Generate embeddings
+# ========
+embedding_data = {}
+
+# sorted(): os.listdir() returns entries in arbitrary order, which made the
+# key order of the JSON output differ between runs and machines.
+for filename in sorted(os.listdir(CONTENT_DIR)):
+    if not filename.endswith(".txt"):
+        continue
+    filepath = os.path.join(CONTENT_DIR, filename)
+    with open(filepath, "r", encoding="utf-8") as file:
+        text = file.read().strip()
+    if not text:
+        # An empty document has no content to embed; storing a vector for it
+        # would only add a meaningless entry to every later search.
+        print(f"[WARN] Skipping empty file: {filepath}")
+        continue
+    # tolist() turns the encoded vector into plain floats that json can serialize
+    embedding_data[filename] = model.encode(text).tolist()
+
+# Save embeddings to JSON (explicit encoding, independent of the platform default)
+with open(OUTPUT_FILE, "w", encoding="utf-8") as json_file:
+    json.dump(embedding_data, json_file, indent=4)
+
+print(f"[INFO] Embeddings successfully saved to {OUTPUT_FILE}")
+
diff --git a/gitea_push.sh b/gitea_push.sh
new file mode 100755
index 0000000..0bca28e
--- /dev/null
+++ b/gitea_push.sh
@@ -0,0 +1,219 @@
+#!/bin/zsh
+# Script Version: 1.5
+# Description: Pushes the current folder (e.g. /etc) to a nested Gitea repo using provided nesting arguments. Auto-creates the remote repo via Gitea API if missing.
+
+# Set variables
+# ========
+
+# Try to extract GITEA_API_TOKEN from ~/.gitea_token if not set
+if [ -z "$GITEA_API_TOKEN" ] && [ -f "$HOME/.gitea_token" ]; then
+  GITEA_API_TOKEN=$(<"$HOME/.gitea_token")
+  export GITEA_API_TOKEN
+fi
+
+GITEA_USER=$(awk '{for(i=1;i<=NF;i++) if($i=="login") print $(i+1)}' ~/.netrc | head -n1)
+if [ -z "$GITEA_USER" ]; then
+  echo "[ERROR] No login found in ~/.netrc"
+  exit 1
+fi
+
+# Check the bare host: GITEA_URL always carries the "https://" prefix, so testing it for emptiness could never fail
+GITEA_HOST=$(awk '{for(i=1;i<=NF;i++) if($i=="machine") print $(i+1)}' ~/.netrc | head -n1)
+if [ -z "$GITEA_HOST" ]; then
+  echo "[ERROR] No URL found in ~/.netrc"
+  exit 1
+fi
+GITEA_URL="https://$GITEA_HOST"
+GITEA_API_URL="$GITEA_URL/api/v1"
+
+PRIVATE=false
+DEBUG=false
+COMMIT_MESSAGE="Update $(date +"%F_%T")"
+
+# Logging function: log LEVEL MESSAGE... (DEBUG lines are printed only when DEBUG=true)
+# ========
+log() {
+  local level="$1"; shift
+  if [[ "$level" == "DEBUG" && "$DEBUG" != true ]]; then return; fi
+  local color_reset="$(tput sgr0)"
+  local color=""
+  case "$level" in
+    INFO) color="$(tput setaf 2)" ;; # green
+    WARNING) color="$(tput setaf 3)" ;; # yellow
+    ERROR) color="$(tput setaf 1)" ;; # red
+    DEBUG) color="$(tput setaf 4)" ;; # blue
+  esac
+  echo "${color}[$level] $*${color_reset}"
+}
+
+# Functions
+# ========
+create_repo() {
+  log INFO "Repository does not exist. Creating via API: $REMOTE_PATH"
+  log DEBUG "POST $GITEA_API_URL/user/repos with name=$REMOTE_PATH and private=$PRIVATE"
+  RESPONSE=$(curl -s -X POST \
+    -H "Authorization: token $GITEA_API_TOKEN" \
+    -H "Content-Type: application/json" \
+    -d "{\"name\": \"$FOLDER_NAME\", \"private\": $PRIVATE}" \
+    "$GITEA_API_URL/user/repos")
+
+  if echo "$RESPONSE" | grep -q '"clone_url"'; then
+    log INFO "Remote repository created successfully."
+    HTTP_STATUS=200 # lets the main flow continue with remote setup and push
+  else
+    log ERROR "Failed to create remote repository: $RESPONSE"
+    exit 1
+  fi
+}
+
+prepare_commit() {
+  git add .
+  # Test for HEAD first: 'git diff HEAD' fails in a repo without commits, so the old combined test could never select this branch
+  if ! git rev-parse --verify HEAD >/dev/null 2>&1; then
+    log INFO "Creating initial commit"
+    git commit -m "$COMMIT_MESSAGE"
+  elif ! git diff --quiet HEAD; then
+    log INFO "Committing changes"
+    git commit -m "$COMMIT_MESSAGE"
+  else
+    log INFO "Nothing to commit"
+  fi
+}
+
+setup_remote() {
+  if git remote | grep -q '^origin$'; then
+    log INFO "Updating remote origin URL"
+    git remote set-url origin "$GIT_REMOTE"
+  else
+    log INFO "Adding remote origin"
+    git remote add origin "$GIT_REMOTE"
+  fi
+}
+
+push_changes() {
+  log INFO "Pushing to $GIT_REMOTE"
+  git push -u origin main
+}
+
+# Show help if no arguments are given
+# ========
+if [ $# -eq 0 ]; then
+  echo "GITEA_API_TOKEN="
+  echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
+  echo "Example: $0 server"
+  echo " $0 --private workstation"
+  echo " $0 --debug server"
+  echo " $0 --message \"minor update\" server"
+  echo
+  echo "Note: You must cd into the target folder before running this script."
+  echo "For example:"
+  echo " cd /etc && $0 server"
+  echo
+  echo "Authentication:"
+  echo " Git operations (clone, push, pull) use ~/.netrc with your Git password:"
+  echo " machine $GITEA_HOST login $GITEA_USER password \"\""
+  echo " chmod 600 ~/.netrc"
+  echo
+  echo " API operations (e.g. creating repos) use a Personal Access Token stored in ~/.gitea_token"
+  echo " echo \"\" > ~/.gitea_token && chmod 600 ~/.gitea_token"
+  exit 0
+fi
+
+# Parse arguments
+# ========
+POSITIONAL_ARGS=()
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --private)
+      PRIVATE=true
+      shift
+      ;;
+    --debug)
+      DEBUG=true
+      shift
+      ;;
+    --message)
+      # Without this guard a trailing --message makes 'shift 2' fail without shifting, so the loop never ends
+      if [[ $# -lt 2 ]]; then log ERROR "--message requires a value"; exit 1; fi
+      COMMIT_MESSAGE="$2"
+      shift 2
+      ;;
+    *)
+      POSITIONAL_ARGS+=("$1")
+      shift
+      ;;
+  esac
+done
+
+set -- "${POSITIONAL_ARGS[@]}"
+
+if [[ $# -ne 1 ]]; then
+  echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
+  exit 1
+fi
+
+HOST_GROUP=$(echo "$1" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-')
+HOST_NAME=$(hostname -s | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-')
+FOLDER_NAME="${HOST_NAME}-${HOST_GROUP}-$(basename "$PWD")"
+REPO_PATH="$PWD"
+REMOTE_PATH="$FOLDER_NAME"
+GIT_REMOTE="$GITEA_URL/$GITEA_USER/$FOLDER_NAME.git"
+
+# Git authentication hint
+log DEBUG "Ensure ~/.netrc has: machine login $GITEA_USER password "
+
+# Check or create remote repo (leaves HTTP_STATUS unset when no API token is available)
+check_or_create_repo() {
+  if [ -z "$GITEA_API_TOKEN" ]; then
+    log WARNING "GITEA_API_TOKEN is not set. Skipping API repo creation."
+    return
+  fi
+
+  HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
+    -H "Authorization: token $GITEA_API_TOKEN" \
+    "$GITEA_API_URL/repos/$GITEA_USER/$FOLDER_NAME")
+
+  if [ "$HTTP_STATUS" -ne 200 ]; then
+    create_repo
+  else
+    log INFO "Remote repository already exists."
+  fi
+}
+
+# Main Process
+# ========
+
+# Safety check against pushing from / or $HOME (runs before any remote repository is created)
+if [[ "$PWD" == "$HOME" || "$PWD" == "/" ]]; then
+  log ERROR "Refusing to run inside \$PWD=$PWD"
+  exit 1
+fi
+
+check_or_create_repo
+
+log INFO "Pushing $REPO_PATH to $GIT_REMOTE"
+cd "$REPO_PATH" || { log ERROR "Directory $REPO_PATH not found"; exit 1; }
+
+# Initialize git if needed
+# Branch is fixed to 'main' for simplicity and consistency
+if [ ! -d .git ]; then
+  log INFO "Initializing Git repo"
+  git init
+  git config init.defaultBranch main
+  git checkout -b main
+else
+  log DEBUG ".git directory already present"
+fi
+
+# Ensure at least one commit exists
+prepare_commit
+
+# Set or update remote, then push (HTTP_STATUS defaults to 0 when the API check was skipped)
+if [ "${HTTP_STATUS:-0}" -eq 200 ]; then
+  setup_remote
+  push_changes
+else
+  log WARNING "Skipping remote setup – repository does not exist."
+  log WARNING "Skipping push – repository does not exist."
+fi
+
diff --git a/import_embeddings.py b/import_embeddings.py
new file mode 100644
index 0000000..0c345e2
--- /dev/null
+++ b/import_embeddings.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# Script Version: 0.2
+# Description: Import existing embeddings.json into ChromaDB using the new client API
+
+import os
+import json
+from chromadb import PersistentClient
+
+CHROMA_DIR = "chromadb"
+COLLECTION_NAME = "cds_docs"
+EMBEDDING_FILE = "embeddings.json"
+CONTENT_DIR = "content"
+
+# New Chroma client (post-migration), opened on the CHROMA_DIR path
+client = PersistentClient(path=CHROMA_DIR)
+collection = client.get_or_create_collection(name=COLLECTION_NAME)
+
+# Load existing embeddings: a JSON object mapping filename -> vector
+with open(EMBEDDING_FILE, "r", encoding="utf-8") as f:
+    embeddings_data = json.load(f)
+
+# Ingest each document; upsert() instead of add() so a re-run refreshes already stored IDs
+for filename, vector in embeddings_data.items():
+    filepath = os.path.join(CONTENT_DIR, filename)
+    try:
+        with open(filepath, "r", encoding="utf-8") as f:
+            text = f.read().strip()
+        collection.upsert(
+            documents=[text],
+            metadatas=[{"filename": filename}],
+            ids=[filename],
+            embeddings=[vector],
+        )
+    except FileNotFoundError:
+        print(f"[WARN] Skipping missing file: {filepath}")
+
+print("\u2705 Embeddings successfully imported into Chroma")
+
diff --git a/semantic_search.py b/semantic_search.py
new file mode 100644
index 0000000..fee9446
--- /dev/null
+++ b/semantic_search.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# Script Version: 0.4
+# Description: Semantic search over local embeddings.json with
+#   content preview and optional file copy
+
+import json
+import torch
+import os
+import shutil
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+
+# Config
+EMBEDDING_FILE = "embeddings.json"
+CONTENT_DIR = "content"
+RESULTS_DIR = "results"
+MODEL_NAME = "all-mpnet-base-v2"
+PREVIEW_LINES = 5  # Number of lines to preview from the matching .txt files
+
+# Ensure results directory exists
+os.makedirs(RESULTS_DIR, exist_ok=True)
+
+# Load model
+model = SentenceTransformer(MODEL_NAME)
+if torch.cuda.is_available():
+    model = model.to("cuda")
+    print("[INFO] Running on GPU")
+
+# Load stored embeddings: a JSON object mapping filename -> vector
+with open(EMBEDDING_FILE, "r", encoding="utf-8") as f:
+    stored_embeddings = json.load(f)
+
+# Prompt user; an empty query has nothing to match against
+query = input("\U0001F50D Enter your search query: ").strip()
+if not query:
+    raise SystemExit("[ERROR] Empty search query.")
+
+# Embed the query, then score all stored vectors with one cosine_similarity call
+query_embedding = model.encode(query)
+results = []
+if stored_embeddings:  # cosine_similarity rejects an empty matrix
+    scores = cosine_similarity([query_embedding], list(stored_embeddings.values()))[0]
+    results = list(zip(stored_embeddings, scores))
+
+# Sort and display top result(s)
+results.sort(key=lambda x: x[1], reverse=True)
+copied_files = []
+
+print("\n\U0001F4C2 Top matches:")
+for fname, score in results[:3]:
+    print(f"\n{fname} → score: {score:.4f}")
+    txt_path = os.path.join(CONTENT_DIR, fname)
+    if os.path.exists(txt_path):
+        print("Preview:")
+        with open(txt_path, "r", encoding="utf-8") as f:
+            for i, line in enumerate(f):
+                print(" " + line.strip())
+                if i + 1 >= PREVIEW_LINES:
+                    break
+
+        # Ask user if they want to copy the file
+        should_copy = input(f"\U0001F4C4 Copy '{fname}' to '{RESULTS_DIR}'? [y/N]: ").strip().lower()
+        if should_copy == "y":
+            dest_path = os.path.join(RESULTS_DIR, fname)
+            shutil.copyfile(txt_path, dest_path)
+            copied_files.append(fname)
+            print(f"[INFO] File copied to {dest_path}")
+    else:
+        print("[WARN] Source file not found for preview.")
+
+# Final summary
+if copied_files:
+    print("\n\u2705 Summary of copied files:")
+    for f in copied_files:
+        print(f" - {f}")
+else:
+    print("\n\u2139\ufe0f No files were copied.")
+