Update 2025-04-13_16:05:19
This commit is contained in:
40
generate_embeddings.py
Normal file
40
generate_embeddings.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Script Version: 01
|
||||||
|
# Description: Generate embeddings from text files using GPU (if available)
|
||||||
|
|
||||||
|
import os
|
||||||
|
import torch
|
||||||
|
import json
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
# Set variables
# ========
CONTENT_DIR = "content"
OUTPUT_FILE = "embeddings.json"

# Load the sentence-embedding model and relocate it to the GPU when CUDA is usable
model = SentenceTransformer("all-mpnet-base-v2")
gpu_present = torch.cuda.is_available()
if gpu_present:
    model = model.to("cuda")
print(
    "[INFO] GPU detected: Model running on GPU"
    if gpu_present
    else "[INFO] No GPU detected: Model running on CPU"
)

# Generate embeddings
# ========
# Maps each .txt file name in CONTENT_DIR to its embedding as a plain list
embedding_data = {}

for filename in os.listdir(CONTENT_DIR):
    # Anything that is not a .txt entry is ignored
    if not filename.endswith(".txt"):
        continue
    filepath = os.path.join(CONTENT_DIR, filename)
    with open(filepath, "r", encoding="utf-8") as file:
        text = file.read().strip()
    # .tolist() turns the encoder output into something json can serialize
    embedding_data[filename] = model.encode(text).tolist()

# Save embeddings to JSON
with open(OUTPUT_FILE, "w") as json_file:
    json.dump(embedding_data, json_file, indent=4)

print(f"[INFO] Embeddings successfully saved to {OUTPUT_FILE}")
|
||||||
|
|
219
gitea_push.sh
Executable file
219
gitea_push.sh
Executable file
@ -0,0 +1,219 @@
|
|||||||
|
#!/bin/zsh
|
||||||
|
# Script Version: 1.5
|
||||||
|
# Description: Pushes the current folder (e.g. /etc) to a nested Gitea repo using provided nesting arguments. Auto-creates the remote repo via Gitea API if missing.
|
||||||
|
|
||||||
|
# Set variables
# ========

# Read the API token from ~/.gitea_token unless GITEA_API_TOKEN is already set
if [ -z "$GITEA_API_TOKEN" ] && [ -f "$HOME/.gitea_token" ]; then
  GITEA_API_TOKEN=$(<"$HOME/.gitea_token")
  export GITEA_API_TOKEN
fi

# The first "login" value found in ~/.netrc is used as the Gitea account name
GITEA_USER=$(awk '{for(i=1;i<=NF;i++) if($i=="login") print $(i+1)}' ~/.netrc | head -n1)
if [ -z "$GITEA_USER" ]; then
  echo "[ERROR] No login found in ~/.netrc"
  exit 1
fi

# The first "machine" value found in ~/.netrc is used as the Gitea host.
# Validate the bare host BEFORE prefixing the scheme: "https://<host>" is never an
# empty string, so testing the assembled URL could never detect a missing entry.
GITEA_HOST=$(awk '{for(i=1;i<=NF;i++) if($i=="machine") print $(i+1)}' ~/.netrc | head -n1)
if [ -z "$GITEA_HOST" ]; then
  echo "[ERROR] No URL found in ~/.netrc"
  exit 1
fi
GITEA_URL="https://$GITEA_HOST"
GITEA_API_URL="$GITEA_URL/api/v1"

# Defaults; --private, --debug and --message override them during argument parsing
PRIVATE=false
DEBUG=false
COMMIT_MESSAGE="Update $(date +"%F_%T")"
|
||||||
|
|
||||||
|
# Logging function
# ========
# Usage: log LEVEL MESSAGE...
# Prints "[LEVEL] MESSAGE" to stdout wrapped in a tput color chosen by LEVEL.
log() {
  local level="$1"
  shift

  # DEBUG lines are dropped unless DEBUG has been switched to true
  if [[ "$level" == "DEBUG" ]] && [[ "$DEBUG" != true ]]; then
    return
  fi

  # Levels other than the four below are printed without a color prefix
  local color=""
  if [[ "$level" == "INFO" ]]; then
    color="$(tput setaf 2)" # green
  elif [[ "$level" == "WARNING" ]]; then
    color="$(tput setaf 3)" # yellow
  elif [[ "$level" == "ERROR" ]]; then
    color="$(tput setaf 1)" # red
  elif [[ "$level" == "DEBUG" ]]; then
    color="$(tput setaf 4)" # blue
  fi
  local color_reset="$(tput sgr0)"

  echo "${color}[$level] $*${color_reset}"
}
|
||||||
|
|
||||||
|
# Functions
# ========
# create_repo: POST to $GITEA_API_URL/user/repos to create a repository named
# $FOLDER_NAME with the visibility given by $PRIVATE. The raw reply is kept in
# RESPONSE. Sets HTTP_STATUS=200 on success; logs the reply and exits 1 otherwise.
create_repo() {
  log INFO "Repository does not exist. Creating via API: $REMOTE_PATH"
  log DEBUG "POST $GITEA_API_URL/user/repos with name=$REMOTE_PATH and private=$PRIVATE"

  local payload="{\"name\": \"$FOLDER_NAME\", \"private\": $PRIVATE}"
  RESPONSE=$(curl -s -X POST \
    -H "Authorization: token $GITEA_API_TOKEN" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    "$GITEA_API_URL/user/repos")

  # The presence of a "clone_url" field in the reply is treated as the success marker
  if ! echo "$RESPONSE" | grep -q '"clone_url"'; then
    log ERROR "Failed to create remote repository: $RESPONSE"
    exit 1
  fi

  log INFO "Remote repository created successfully."
  HTTP_STATUS=200
}
|
||||||
|
|
||||||
|
# prepare_commit: stage everything under the current directory and commit it with
# $COMMIT_MESSAGE when there is something staged.
#
# The existence of HEAD is tested first: on a repository without commits
# "git diff ... HEAD" fails with a fatal error, which previously made the
# "initial commit" branch unreachable and printed the git error to the terminal.
# "git diff --cached --quiet" compares the index with HEAD (or with an empty tree
# when HEAD does not exist yet), i.e. it answers exactly "would git commit have
# anything to record?".
prepare_commit() {
  git add .

  if git diff --cached --quiet; then
    # Index matches HEAD (or is empty on an unborn branch): committing would fail
    log INFO "Nothing to commit"
    return 0
  fi

  if git rev-parse --verify HEAD >/dev/null 2>&1; then
    log INFO "Committing changes"
  else
    log INFO "Creating initial commit"
  fi
  git commit -m "$COMMIT_MESSAGE"
}
|
||||||
|
|
||||||
|
# setup_remote: make the remote named "origin" point at $GIT_REMOTE, adding the
# remote when it is not configured yet and rewriting its URL when it is.
setup_remote() {
  if ! git remote | grep -qx 'origin'; then
    log INFO "Adding remote origin"
    git remote add origin "$GIT_REMOTE"
    return
  fi

  log INFO "Updating remote origin URL"
  git remote set-url origin "$GIT_REMOTE"
}
|
||||||
|
|
||||||
|
# push_changes: push branch "main" to origin and record it as the upstream branch.
push_changes() {
  local target_branch="main"
  log INFO "Pushing to $GIT_REMOTE"
  git push --set-upstream origin "$target_branch"
}
|
||||||
|
|
||||||
|
# Show help if no arguments are given
# ========
if [ $# -eq 0 ]; then
  echo "GITEA_API_TOKEN=<your token>"
  echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
  echo "Example: $0 server"
  echo " $0 --private workstation"
  echo " $0 --debug server"
  echo " $0 --message \"minor update\" server"
  echo
  echo "Note: You must cd into the target folder before running this script."
  echo "For example:"
  echo " cd /etc && $0 server"
  echo
  echo "Authentication:"
  echo " Git operations (clone, push, pull) use ~/.netrc with your Git password:"
  # Show the bare host name (scheme stripped) so the line can be pasted into ~/.netrc.
  # The earlier text escaped the command substitution and therefore printed a literal
  # "$(echo ... | sed ...)" snippet instead of the host.
  echo " machine ${GITEA_URL#https://} login $GITEA_USER password \"<your Git password>\""
  echo " chmod 600 ~/.netrc"
  echo
  echo " API operations (e.g. creating repos) use a Personal Access Token stored in ~/.gitea_token"
  echo " echo \"<your_token>\" > ~/.gitea_token && chmod 600 ~/.gitea_token"
  exit 0
fi
|
||||||
|
|
||||||
|
# Parse arguments
# ========
# Options may appear anywhere on the command line; everything that is not a known
# option is collected as a positional argument. Exactly one positional argument
# (<host_group>) must remain.
POSITIONAL_ARGS=()
while [[ $# -gt 0 ]]; do
  case "$1" in
    --private)
      PRIVATE=true
      shift
      ;;
    --debug)
      DEBUG=true
      shift
      ;;
    --message)
      # Without this guard a trailing "--message" makes "shift 2" fail without
      # shifting anything, so the loop would never terminate.
      if [[ $# -lt 2 ]]; then
        echo "[ERROR] --message requires a value"
        echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
        exit 1
      fi
      COMMIT_MESSAGE="$2"
      shift 2
      ;;
    *)
      POSITIONAL_ARGS+=("$1")
      shift
      ;;
  esac
done

# Re-install the positional arguments as $1..$n
set -- "${POSITIONAL_ARGS[@]}"

if [[ $# -ne 1 ]]; then
  echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
  exit 1
fi
|
||||||
|
|
||||||
|
# Lowercase stdin and delete every character outside a-z, 0-9 and '-'
_slugify() {
  tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-'
}

# Repository name: <short hostname>-<host group>-<name of the current folder>
HOST_GROUP=$(echo "$1" | _slugify)
HOST_NAME=$(hostname -s | _slugify)
REPO_PATH="$PWD"
FOLDER_NAME="${HOST_NAME}-${HOST_GROUP}-$(basename "$PWD")"
REMOTE_PATH="$FOLDER_NAME"
GIT_REMOTE="$GITEA_URL/$GITEA_USER/$FOLDER_NAME.git"

# Git authentication hint
log DEBUG "Ensure ~/.netrc has: machine <host> login $GITEA_USER password <your Git password>"
|
||||||
|
|
||||||
|
# Check or create remote repo
# check_or_create_repo: decide whether $GIT_REMOTE exists and record the outcome
# in HTTP_STATUS (200 = exists / was created), which the main process uses to
# decide whether to configure the remote and push.
#
# With a token: query the Gitea API and create the repository when the lookup
# does not answer 200.
# Without a token: the API cannot be used, so the remote is probed with
# "git ls-remote" instead. Previously this path returned with HTTP_STATUS unset,
# so the script always skipped the push and reported "repository does not exist"
# even for an existing repository.
check_or_create_repo() {
  if [ -z "$GITEA_API_TOKEN" ]; then
    log WARNING "GITEA_API_TOKEN is not set. Skipping API repo creation."
    # GIT_TERMINAL_PROMPT=0 keeps git from asking for credentials interactively;
    # a failed probe is simply treated as "not reachable".
    if GIT_TERMINAL_PROMPT=0 git ls-remote "$GIT_REMOTE" >/dev/null 2>&1; then
      log INFO "Remote repository already exists."
      HTTP_STATUS=200
    else
      HTTP_STATUS=0
    fi
    return 0
  fi

  # -o /dev/null discards the body; only the status code is captured
  HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
    -H "Authorization: token $GITEA_API_TOKEN" \
    "$GITEA_API_URL/repos/$GITEA_USER/$FOLDER_NAME")

  if [ "$HTTP_STATUS" -ne 200 ]; then
    create_repo
  else
    log INFO "Remote repository already exists."
  fi
}
|
||||||
|
|
||||||
|
|
||||||
|
# Main Process
# ========

# Safety check against pushing from / or $HOME
# This runs before check_or_create_repo so that a refused run cannot leave a
# freshly created remote repository behind.
if [[ "$PWD" == "$HOME" || "$PWD" == "/" ]]; then
  log ERROR "Refusing to run inside \$PWD=$PWD"
  exit 1
fi

# Look up the remote repository (and create it through the API when it is missing)
check_or_create_repo

log INFO "Pushing $REPO_PATH to $GIT_REMOTE"
cd "$REPO_PATH" || { log ERROR "Directory $REPO_PATH not found"; exit 1; }
|
||||||
|
|
||||||
|
# Initialize git if needed
# Branch is fixed to 'main' for simplicity and consistency
if [ -d .git ]; then
  log DEBUG ".git directory already present"
else
  log INFO "Initializing Git repo"
  git init
  git config init.defaultBranch main
  git checkout -b main
fi
|
||||||
|
|
||||||
|
# Ensure at least one commit exists
prepare_commit

# Set or update remote, then push to it.
# Both steps run only when HTTP_STATUS is 200; otherwise each one reports that it
# was skipped.
if [ "$HTTP_STATUS" -eq 200 ]; then
  setup_remote
  push_changes
else
  log WARNING "Skipping remote setup – repository does not exist."
  log WARNING "Skipping push – repository does not exist."
fi
|
||||||
|
|
38
import_embeddings.py
Normal file
38
import_embeddings.py
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Script Version: 0.2
|
||||||
|
# Description: Import existing embeddings.json into ChromaDB using the new client API
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from chromadb import PersistentClient
|
||||||
|
|
||||||
|
CHROMA_DIR = "chromadb"
COLLECTION_NAME = "cds_docs"
EMBEDDING_FILE = "embeddings.json"
CONTENT_DIR = "content"

# New Chroma client (post-migration)
client = PersistentClient(path=CHROMA_DIR)
collection = client.get_or_create_collection(name=COLLECTION_NAME)

# Load existing embeddings: a JSON object mapping file name -> embedding vector
with open(EMBEDDING_FILE, "r") as f:
    embeddings_data = json.load(f)

# Ingest each document
for filename, vector in embeddings_data.items():
    filepath = os.path.join(CONTENT_DIR, filename)

    # Only the file read is guarded, so a FileNotFoundError raised anywhere else
    # is not mistaken for a missing content file.
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read().strip()
    except FileNotFoundError:
        print(f"[WARN] Skipping missing file: {filepath}")
        continue

    # upsert instead of add: the file name is the record id, so re-running the
    # import after regenerating embeddings.json must replace the stored vector
    # and text rather than keep the stale entry.
    collection.upsert(
        documents=[text],
        metadatas=[{"filename": filename}],
        ids=[filename],
        embeddings=[vector],
    )

print("✅ Embeddings successfully imported into Chroma")
|
||||||
|
|
79
semantic_search.py
Normal file
79
semantic_search.py
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Script Version: 0.4
|
||||||
|
# Description: Semantic search over local embeddings.json with content preview and optional file copy
|
||||||
|
|
||||||
|
import json
|
||||||
|
import torch
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import numpy as np
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
|
||||||
|
# Config
EMBEDDING_FILE = "embeddings.json"
CONTENT_DIR = "content"
RESULTS_DIR = "results"
MODEL_NAME = "all-mpnet-base-v2"
PREVIEW_LINES = 5  # Number of lines to preview from the matching .txt files

# Ensure results directory exists
os.makedirs(RESULTS_DIR, exist_ok=True)

# Load model (moved to the GPU when CUDA is available)
model = SentenceTransformer(MODEL_NAME)
if torch.cuda.is_available():
    model = model.to("cuda")
    print("[INFO] Running on GPU")

# Load stored embeddings: file name -> embedding vector
with open(EMBEDDING_FILE, "r") as f:
    stored_embeddings = json.load(f)

# Prompt user and embed the query
query = input("\U0001F50D Enter your search query: ").strip()
query_embedding = model.encode(query)

# Score every stored embedding against the query, best match first
results = [
    (filename, cosine_similarity([query_embedding], [embedding])[0][0])
    for filename, embedding in stored_embeddings.items()
]
results.sort(key=lambda pair: pair[1], reverse=True)

copied_files = []

print("\n\U0001F4C2 Top matches:")
for fname, score in results[:3]:
    print(f"\n{fname} → score: {score:.4f}")
    txt_path = os.path.join(CONTENT_DIR, fname)

    if not os.path.exists(txt_path):
        print("[WARN] Source file not found for preview.")
        continue

    # Show the first PREVIEW_LINES lines of the matching file
    print("Preview:")
    with open(txt_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            print(" " + line.strip())
            if line_no >= PREVIEW_LINES:
                break

    # Ask user if they want to copy the file; only an explicit "y" copies it
    answer = input(f"📄 Copy '{fname}' to '{RESULTS_DIR}'? [y/N]: ").strip().lower()
    if answer != "y":
        continue

    dest_path = os.path.join(RESULTS_DIR, fname)
    shutil.copyfile(txt_path, dest_path)
    copied_files.append(fname)
    print(f"[INFO] File copied to {dest_path}")

# Final summary
if not copied_files:
    print("\nℹ️ No files were copied.")
else:
    print("\n✅ Summary of copied files:")
    for copied_name in copied_files:
        print(f" - {copied_name}")
|
||||||
|
|
Reference in New Issue
Block a user