Update 2025-04-13_16:05:19

This commit is contained in:
oib
2025-04-13 16:05:19 +02:00
commit a10ca4fcef
4 changed files with 376 additions and 0 deletions

40
generate_embeddings.py Normal file
View File

@ -0,0 +1,40 @@
#!/usr/bin/env python3
# Script Version: 01
# Description: Generate embeddings from text files using GPU (if available)
import os
import torch
import json
from sentence_transformers import SentenceTransformer
# Set variables
# ========
CONTENT_DIR = "content"
OUTPUT_FILE = "embeddings.json"

# Initialize the embedding model and move it to the GPU if one is available
model = SentenceTransformer("all-mpnet-base-v2")
if torch.cuda.is_available():
    model = model.to("cuda")
    print("[INFO] GPU detected: Model running on GPU")
else:
    print("[INFO] No GPU detected: Model running on CPU")

# Generate embeddings
# ========
# Maps each .txt filename found in CONTENT_DIR to its embedding, converted to
# a plain list via .tolist() so that it can be serialized as JSON.
embedding_data = {}
# os.listdir() returns entries in arbitrary order; sorting keeps the JSON
# output stable between runs over unchanged content.
for filename in sorted(os.listdir(CONTENT_DIR)):
    if not filename.endswith(".txt"):
        continue
    filepath = os.path.join(CONTENT_DIR, filename)
    # A directory whose name ends in ".txt" cannot be opened as a text file.
    if not os.path.isfile(filepath):
        continue
    with open(filepath, "r", encoding="utf-8") as file:
        text = file.read().strip()
    # An empty document carries no content to embed or to search for.
    if not text:
        print(f"[WARN] Skipping empty file: {filepath}")
        continue
    embedding = model.encode(text)
    embedding_data[filename] = embedding.tolist()

# Save embeddings to JSON
with open(OUTPUT_FILE, "w", encoding="utf-8") as json_file:
    json.dump(embedding_data, json_file, indent=4)
print(f"[INFO] Embeddings successfully saved to {OUTPUT_FILE}")

219
gitea_push.sh Executable file
View File

@ -0,0 +1,219 @@
#!/bin/zsh
# Script Version: 1.5
# Description: Pushes the current folder (e.g. /etc) to a nested Gitea repo using provided nesting arguments. Auto-creates the remote repo via Gitea API if missing.
# Set variables
# ========
# Fall back to the token stored in ~/.gitea_token when GITEA_API_TOKEN is not
# already set in the environment.
if [ -z "$GITEA_API_TOKEN" ] && [ -f "$HOME/.gitea_token" ]; then
  GITEA_API_TOKEN=$(<"$HOME/.gitea_token")
  export GITEA_API_TOKEN
fi

# First "login" value found in ~/.netrc
GITEA_USER=$(awk '{for(i=1;i<=NF;i++) if($i=="login") print $(i+1)}' ~/.netrc | head -n1)
if [ -z "$GITEA_USER" ]; then
  echo "[ERROR] No login found in ~/.netrc"
  exit 1
fi

# First "machine" value found in ~/.netrc. The host is validated BEFORE the
# scheme is prepended: a check on the finished URL could never fail, because
# "https://" on its own is already a non-empty string.
GITEA_HOST=$(awk '{for(i=1;i<=NF;i++) if($i=="machine") print $(i+1)}' ~/.netrc | head -n1)
if [ -z "$GITEA_HOST" ]; then
  echo "[ERROR] No URL found in ~/.netrc"
  exit 1
fi
GITEA_URL="https://$GITEA_HOST"
GITEA_API_URL="$GITEA_URL/api/v1"

# Defaults; may be overridden by --private, --debug and --message
PRIVATE=false
DEBUG=false
COMMIT_MESSAGE="Update $(date +"%F_%T")"
# Logging function
# ========
# Usage: log LEVEL message...
# Prints "[LEVEL] message" to stdout, colored by level (INFO green, WARNING
# yellow, ERROR red, DEBUG blue). DEBUG messages are dropped unless DEBUG=true.
log() {
  local level="$1"; shift
  if [[ "$level" == "DEBUG" && "$DEBUG" != true ]]; then return; fi
  local color=""
  local color_reset=""
  # Only emit color codes when stdout is a terminal, so redirected output stays
  # free of escape sequences. tput's stderr is silenced so that an unset or
  # unknown TERM does not add error noise to every log line.
  if [ -t 1 ]; then
    color_reset="$(tput sgr0 2>/dev/null)"
    case "$level" in
      INFO) color="$(tput setaf 2 2>/dev/null)" ;; # green
      WARNING) color="$(tput setaf 3 2>/dev/null)" ;; # yellow
      ERROR) color="$(tput setaf 1 2>/dev/null)" ;; # red
      DEBUG) color="$(tput setaf 4 2>/dev/null)" ;; # blue
    esac
  fi
  # printf with %s prints the message verbatim; zsh's echo would expand
  # backslash sequences contained in it (e.g. in a JSON API response).
  printf '%s[%s] %s%s\n' "$color" "$level" "$*" "$color_reset"
}
# Functions
# ========
# Create the remote repository through the Gitea API (POST /user/repos).
# Reads FOLDER_NAME, REMOTE_PATH, PRIVATE, GITEA_API_URL and GITEA_API_TOKEN.
# On success HTTP_STATUS is set to 200; on failure the script exits with 1.
# The raw API response is left in RESPONSE.
create_repo() {
  local payload="{\"name\": \"$FOLDER_NAME\", \"private\": $PRIVATE}"
  local endpoint="$GITEA_API_URL/user/repos"

  log INFO "Repository does not exist. Creating via API: $REMOTE_PATH"
  log DEBUG "POST $GITEA_API_URL/user/repos with name=$REMOTE_PATH and private=$PRIVATE"
  RESPONSE=$(curl -s -X POST \
    -H "Authorization: token $GITEA_API_TOKEN" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    "$endpoint")

  # Success is recognised by a "clone_url" field in the response body.
  if ! echo "$RESPONSE" | grep -q '"clone_url"'; then
    log ERROR "Failed to create remote repository: $RESPONSE"
    exit 1
  fi
  log INFO "Remote repository created successfully."
  HTTP_STATUS=200
}
# Stage everything in the working directory and commit it when there is
# something to commit. Logs which case applied: initial commit, regular
# commit, or nothing to do. Uses COMMIT_MESSAGE as the commit message.
prepare_commit() {
  git add .
  # Compare the index with HEAD (or with the empty tree when no commit exists
  # yet). Unlike "git diff --quiet HEAD", this does not abort with a fatal
  # "ambiguous argument 'HEAD'" error in a freshly initialized repository.
  if git diff --cached --quiet; then
    log INFO "Nothing to commit"
    return
  fi
  # HEAD only resolves once at least one commit exists.
  if git rev-parse --verify HEAD >/dev/null 2>&1; then
    log INFO "Committing changes"
  else
    log INFO "Creating initial commit"
  fi
  git commit -m "$COMMIT_MESSAGE"
}
# Point the "origin" remote at $GIT_REMOTE: add it when it is missing,
# otherwise overwrite its URL.
setup_remote() {
  # -x matches whole lines only, so a remote such as "origin2" does not count.
  if ! git remote | grep -qx 'origin'; then
    log INFO "Adding remote origin"
    git remote add origin "$GIT_REMOTE"
    return
  fi
  log INFO "Updating remote origin URL"
  git remote set-url origin "$GIT_REMOTE"
}
# Push the local "main" branch to origin and record it as the upstream (-u).
push_changes() {
  local target_branch="main"
  log INFO "Pushing to $GIT_REMOTE"
  git push -u origin "$target_branch"
}
# Show help if no arguments are given
# ========
if [ $# -eq 0 ]; then
  echo "GITEA_API_TOKEN=<your token>"
  echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
  echo "Example: $0 server"
  echo " $0 --private workstation"
  echo " $0 --debug server"
  echo " $0 --message \"minor update\" server"
  echo
  echo "Note: You must cd into the target folder before running this script."
  echo "For example:"
  echo " cd /etc && $0 server"
  echo
  echo "Authentication:"
  echo " Git operations (clone, push, pull) use ~/.netrc with your Git password:"
  # ${GITEA_URL#*://} strips the scheme so that only the host is shown. The
  # previous escaped "\$(...)" was never executed and printed the literal
  # command substitution into the help text instead of the host name.
  echo " machine ${GITEA_URL#*://} login $GITEA_USER password \"<your Git password>\""
  echo " chmod 600 ~/.netrc"
  echo
  echo " API operations (e.g. creating repos) use a Personal Access Token stored in ~/.gitea_token"
  echo " echo \"<your_token>\" > ~/.gitea_token && chmod 600 ~/.gitea_token"
  exit 0
fi
# Parse arguments
# ========
# Options set PRIVATE, DEBUG and COMMIT_MESSAGE; everything else is collected
# in POSITIONAL_ARGS, which must end up holding exactly one <host_group>.
POSITIONAL_ARGS=()
while [[ $# -gt 0 ]]; do
  case "$1" in
    --private)
      PRIVATE=true
      shift
      ;;
    --debug)
      DEBUG=true
      shift
      ;;
    --message)
      # Without this guard "shift 2" fails when --message is the last
      # argument, leaves $1 untouched and the loop never terminates.
      if [[ $# -lt 2 ]]; then
        echo "[ERROR] --message requires a value"
        echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
        exit 1
      fi
      COMMIT_MESSAGE="$2"
      shift 2
      ;;
    -*)
      # A mistyped option must not silently become part of the repo name.
      echo "[ERROR] Unknown option: $1"
      echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
      exit 1
      ;;
    *)
      POSITIONAL_ARGS+=("$1")
      shift
      ;;
  esac
done
set -- "${POSITIONAL_ARGS[@]}"
if [[ $# -ne 1 ]]; then
  echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
  exit 1
fi
# Derive the repository name "<host>-<group>-<directory>" and the remote URL.
# printf is used instead of echo so that the argument is passed on verbatim.
HOST_GROUP=$(printf '%s\n' "$1" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-')
if [ -z "$HOST_GROUP" ]; then
  # Every character was stripped by the sanitizing step above.
  echo "[ERROR] <host_group> must contain at least one of: a-z 0-9 -"
  exit 1
fi
HOST_NAME=$(hostname -s | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-')
# Replace every character outside A-Z a-z 0-9 . _ - with a dash so that a
# directory name containing spaces or quotes cannot break the JSON payload or
# the remote URL. Names that already consist of these characters are unchanged.
# NOTE(review): assumes Gitea accepts exactly this character set in repository
# names - confirm against the Gitea version in use.
DIR_NAME=$(printf '%s' "$(basename "$PWD")" | tr -c 'A-Za-z0-9._-' '-')
FOLDER_NAME="${HOST_NAME}-${HOST_GROUP}-${DIR_NAME}"
REPO_PATH="$PWD"
REMOTE_PATH="$FOLDER_NAME"
GIT_REMOTE="$GITEA_URL/$GITEA_USER/$FOLDER_NAME.git"
# Git authentication hint
log DEBUG "Ensure ~/.netrc has: machine <host> login $GITEA_USER password <your Git password>"
# Safety check against pushing from / or $HOME
# This runs before any API call, so that a refused run cannot leave a freshly
# created (and unwanted) remote repository behind.
if [[ "$PWD" == "$HOME" || "$PWD" == "/" ]]; then
  log ERROR "Refusing to run inside \$PWD=$PWD"
  exit 1
fi

# Check or create remote repo
# Queries the Gitea API for $GITEA_USER/$FOLDER_NAME and stores the HTTP
# status code in HTTP_STATUS. Any status other than 200 triggers create_repo.
# Without GITEA_API_TOKEN nothing is queried and HTTP_STATUS is left unset.
check_or_create_repo() {
  if [ -z "$GITEA_API_TOKEN" ]; then
    log WARNING "GITEA_API_TOKEN is not set. Skipping API repo creation."
    return
  fi
  HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
    -H "Authorization: token $GITEA_API_TOKEN" \
    "$GITEA_API_URL/repos/$GITEA_USER/$FOLDER_NAME")
  # curl reports 000 when no HTTP response was received at all (DNS failure,
  # refused connection, TLS error). Attempting to create the repository
  # through the same unreachable API would only fail with an empty message.
  if [ "$HTTP_STATUS" = "000" ]; then
    log ERROR "Could not reach Gitea API at $GITEA_API_URL"
    exit 1
  fi
  if [ "$HTTP_STATUS" -ne 200 ]; then
    create_repo
  else
    log INFO "Remote repository already exists."
  fi
}
check_or_create_repo

# Main Process
# ========
log INFO "Pushing $REPO_PATH to $GIT_REMOTE"
if ! cd "$REPO_PATH"; then
  log ERROR "Directory $REPO_PATH not found"
  exit 1
fi

# Initialize git if needed
# Branch is fixed to 'main' for simplicity and consistency
if [ -d .git ]; then
  log DEBUG ".git directory already present"
else
  log INFO "Initializing Git repo"
  git init
  git config init.defaultBranch main
  git checkout -b main
fi

# Ensure at least one commit exists
prepare_commit

# Set or update the remote, then push to it. Both steps run only when
# HTTP_STATUS is 200, i.e. the remote repository was found or just created.
if [ "$HTTP_STATUS" -eq 200 ]; then
  setup_remote
  push_changes
else
  log WARNING "Skipping remote setup repository does not exist."
  log WARNING "Skipping push repository does not exist."
fi

38
import_embeddings.py Normal file
View File

@ -0,0 +1,38 @@
#!/usr/bin/env python3
# Script Version: 0.2
# Description: Import existing embeddings.json into ChromaDB using the new client API
import os
import json
from chromadb import PersistentClient
CHROMA_DIR = "chromadb"
COLLECTION_NAME = "cds_docs"
EMBEDDING_FILE = "embeddings.json"
CONTENT_DIR = "content"

# New Chroma client (post-migration)
client = PersistentClient(path=CHROMA_DIR)
collection = client.get_or_create_collection(name=COLLECTION_NAME)

# Load existing embeddings: a JSON object mapping filename -> embedding vector
with open(EMBEDDING_FILE, "r", encoding="utf-8") as f:
    embeddings_data = json.load(f)

# Ingest each document
imported_count = 0
for filename, vector in embeddings_data.items():
    filepath = os.path.join(CONTENT_DIR, filename)
    # Only the file read is guarded, so a FileNotFoundError raised anywhere
    # else is not mistaken for a missing content file.
    try:
        with open(filepath, "r", encoding="utf-8") as doc_file:
            text = doc_file.read().strip()
    except FileNotFoundError:
        print(f"[WARN] Skipping missing file: {filepath}")
        continue
    # upsert() inserts new ids and overwrites existing ones, so the script can
    # be re-run after embeddings.json was regenerated. add() does not replace
    # a record whose id is already present (depending on the Chroma version
    # the duplicate is ignored or rejected).
    collection.upsert(
        documents=[text],
        metadatas=[{"filename": filename}],
        ids=[filename],
        embeddings=[vector],
    )
    imported_count += 1

print(f"[INFO] Imported {imported_count} of {len(embeddings_data)} documents")
print("✅ Embeddings successfully imported into Chroma")

79
semantic_search.py Normal file
View File

@ -0,0 +1,79 @@
#!/usr/bin/env python3
# Script Version: 0.4
# Description: Semantic search over local embeddings.json with content preview and optional file copy
import json
import torch
import os
import shutil
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Config
EMBEDDING_FILE = "embeddings.json"
CONTENT_DIR = "content"
RESULTS_DIR = "results"
MODEL_NAME = "all-mpnet-base-v2"
PREVIEW_LINES = 5  # Number of lines to preview from the matching .txt files
TOP_K = 3  # Number of best-scoring files to display

# Ensure results directory exists
os.makedirs(RESULTS_DIR, exist_ok=True)

# Load model
model = SentenceTransformer(MODEL_NAME)
if torch.cuda.is_available():
    model = model.to("cuda")
    print("[INFO] Running on GPU")

# Load stored embeddings: a JSON object mapping filename -> embedding vector
with open(EMBEDDING_FILE, "r", encoding="utf-8") as f:
    stored_embeddings = json.load(f)
if not stored_embeddings:
    # Nothing to compare against; also keeps the matrix below non-empty.
    raise SystemExit(f"[ERROR] No embeddings found in {EMBEDDING_FILE}")

# Prompt user
query = input("\U0001F50D Enter your search query: ").strip()
if not query:
    raise SystemExit("[ERROR] Empty search query")

# Embed query
query_embedding = model.encode(query)

# Compute cosine similarities
# All stored vectors are stacked into one matrix so that a single
# cosine_similarity call scores every document, instead of one call per file.
filenames = list(stored_embeddings)
embedding_matrix = np.asarray([stored_embeddings[name] for name in filenames])
scores = cosine_similarity([query_embedding], embedding_matrix)[0]

# Sort and display top result(s)
results = sorted(zip(filenames, scores), key=lambda item: item[1], reverse=True)
copied_files = []
print("\n\U0001F4C2 Top matches:")
for fname, score in results[:TOP_K]:
    print(f"\n{fname} → score: {score:.4f}")
    txt_path = os.path.join(CONTENT_DIR, fname)
    if not os.path.exists(txt_path):
        print("[WARN] Source file not found for preview.")
        continue

    print("Preview:")
    with open(txt_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            print(" " + line.strip())
            if i + 1 >= PREVIEW_LINES:
                break

    # Ask user if they want to copy the file
    should_copy = input(f"📄 Copy '{fname}' to '{RESULTS_DIR}'? [y/N]: ").strip().lower()
    if should_copy == "y":
        dest_path = os.path.join(RESULTS_DIR, fname)
        shutil.copyfile(txt_path, dest_path)
        copied_files.append(fname)
        print(f"[INFO] File copied to {dest_path}")

# Final summary
if copied_files:
    print("\n✅ Summary of copied files:")
    for copied_name in copied_files:
        print(f" - {copied_name}")
else:
    print("\n No files were copied.")