Update 2025-04-13_16:05:19
This commit is contained in:
40
generate_embeddings.py
Normal file
40
generate_embeddings.py
Normal file
@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python3
|
||||
# Script Version: 01
|
||||
# Description: Generate embeddings from text files using GPU (if available)
|
||||
|
||||
import os
|
||||
import torch
|
||||
import json
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
# Set variables
# ========
OUTPUT_FILE = "embeddings.json"
CONTENT_DIR = "content"

# Load the sentence-embedding model once; move it to CUDA when torch reports a GPU
model = SentenceTransformer("all-mpnet-base-v2")
if not torch.cuda.is_available():
    print("[INFO] No GPU detected: Model running on CPU")
else:
    model = model.to("cuda")
    print("[INFO] GPU detected: Model running on GPU")
|
||||
|
||||
# Generate embeddings
# ========
# Maps each .txt filename in CONTENT_DIR to its embedding as a plain list of floats.
embedding_data = {}

# Sort the listing so the output file has a stable key order between runs
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(CONTENT_DIR)):
    filepath = os.path.join(CONTENT_DIR, filename)
    # Only regular .txt files are embedded; a directory named "*.txt" would make open() fail.
    if not filename.endswith(".txt") or not os.path.isfile(filepath):
        continue
    with open(filepath, "r", encoding="utf-8") as file:
        text = file.read().strip()
    if not text:
        # An empty document has no content to embed; leave it out of the index.
        print(f"[WARN] Skipping empty file: {filepath}")
        continue
    embedding_data[filename] = model.encode(text).tolist()

# Save embeddings to JSON
with open(OUTPUT_FILE, "w", encoding="utf-8") as json_file:
    json.dump(embedding_data, json_file, indent=4)

print(f"[INFO] Embeddings successfully saved to {OUTPUT_FILE}")
|
||||
|
219
gitea_push.sh
Executable file
219
gitea_push.sh
Executable file
@ -0,0 +1,219 @@
|
||||
#!/bin/zsh
|
||||
# Script Version: 1.5
|
||||
# Description: Pushes the current folder (e.g. /etc) to a nested Gitea repo using provided nesting arguments. Auto-creates the remote repo via Gitea API if missing.
|
||||
|
||||
# Set variables
|
||||
# ========
|
||||
|
||||
# Fall back to the token stored in ~/.gitea_token when the environment provides none
if [[ -z "$GITEA_API_TOKEN" && -f "$HOME/.gitea_token" ]]; then
  GITEA_API_TOKEN="$(<"$HOME/.gitea_token")"
  export GITEA_API_TOKEN
fi
|
||||
|
||||
# Login name: the token following the first "login" keyword found in ~/.netrc
GITEA_USER=$(awk '{for(i=1;i<=NF;i++) if($i=="login") print $(i+1)}' ~/.netrc | head -n1)
if [ -z "$GITEA_USER" ]; then
  echo "[ERROR] No login found in ~/.netrc"
  exit 1
fi

# Host: the token following the first "machine" keyword found in ~/.netrc.
# The emptiness check is done on the bare host, before the scheme is prepended;
# a value that already contains "https://" can never be empty.
GITEA_HOST=$(awk '{for(i=1;i<=NF;i++) if($i=="machine") print $(i+1)}' ~/.netrc | head -n1)
if [ -z "$GITEA_HOST" ]; then
  echo "[ERROR] No URL found in ~/.netrc"
  exit 1
fi
GITEA_URL="https://$GITEA_HOST"
GITEA_API_URL="$GITEA_URL/api/v1"

# Defaults, overridable by --private, --debug and --message
PRIVATE=false
DEBUG=false
COMMIT_MESSAGE="Update $(date +"%F_%T")"
|
||||
|
||||
# Logging function
# ========
# Usage: log LEVEL message...
# Prints "[LEVEL] message". DEBUG lines are suppressed unless DEBUG=true.
# Colors are applied only when stdout is a terminal, and tput errors are
# silenced so a missing TERM (e.g. cron) does not add noise to every line.
log() {
  local level="$1"; shift
  if [[ "$level" == "DEBUG" && "$DEBUG" != true ]]; then return 0; fi
  local color=""
  local color_reset=""
  if [[ -t 1 ]]; then
    color_reset="$(tput sgr0 2>/dev/null)"
    case "$level" in
      INFO) color="$(tput setaf 2 2>/dev/null)" ;; # green
      WARNING) color="$(tput setaf 3 2>/dev/null)" ;; # yellow
      ERROR) color="$(tput setaf 1 2>/dev/null)" ;; # red
      DEBUG) color="$(tput setaf 4 2>/dev/null)" ;; # blue
    esac
  fi
  echo "${color}[$level] $*${color_reset}"
}
|
||||
|
||||
# Functions
# ========

# Create the repository named $FOLDER_NAME for the authenticated user through
# the Gitea API. Sets HTTP_STATUS=200 on success; exits the script on failure.
create_repo() {
  local payload="{\"name\": \"$FOLDER_NAME\", \"private\": $PRIVATE}"

  log INFO "Repository does not exist. Creating via API: $REMOTE_PATH"
  log DEBUG "POST $GITEA_API_URL/user/repos with name=$REMOTE_PATH and private=$PRIVATE"

  RESPONSE=$(curl -s -X POST \
    -H "Authorization: token $GITEA_API_TOKEN" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    "$GITEA_API_URL/user/repos")

  # The response is treated as a success when it contains a "clone_url" field
  if ! echo "$RESPONSE" | grep -q '"clone_url"'; then
    log ERROR "Failed to create remote repository: $RESPONSE"
    exit 1
  fi

  log INFO "Remote repository created successfully."
  HTTP_STATUS=200
}
|
||||
|
||||
# Stage everything in the current directory and commit it with $COMMIT_MESSAGE
# when there is something staged.
prepare_commit() {
  git add .

  # --cached compares the index with HEAD, or with the empty tree when the
  # branch has no commit yet, so this test is valid in a freshly initialized repo.
  if git diff --cached --quiet; then
    log INFO "Nothing to commit"
  elif ! git rev-parse --verify HEAD >/dev/null 2>&1; then
    log INFO "Creating initial commit"
    git commit -m "$COMMIT_MESSAGE"
  else
    log INFO "Committing changes"
    git commit -m "$COMMIT_MESSAGE"
  fi
}
|
||||
|
||||
# Point the "origin" remote at $GIT_REMOTE, adding it when it is not configured yet.
setup_remote() {
  if ! git remote | grep -qx 'origin'; then
    log INFO "Adding remote origin"
    git remote add origin "$GIT_REMOTE"
  else
    log INFO "Updating remote origin URL"
    git remote set-url origin "$GIT_REMOTE"
  fi
}
|
||||
|
||||
# Push the main branch to origin and record it as the upstream.
push_changes() {
  log INFO "Pushing to $GIT_REMOTE"
  git push --set-upstream origin main
}
|
||||
|
||||
# Show help if no arguments are given
# ========
if [ $# -eq 0 ]; then
  echo "GITEA_API_TOKEN=<your token>"
  echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
  echo "Example: $0 server"
  echo " $0 --private workstation"
  echo " $0 --debug server"
  echo " $0 --message \"minor update\" server"
  echo
  echo "Note: You must cd into the target folder before running this script."
  echo "For example:"
  echo " cd /etc && $0 server"
  echo
  echo "Authentication:"
  echo " Git operations (clone, push, pull) use ~/.netrc with your Git password:"
  # Print the bare host name; GITEA_URL is always built with an "https://" prefix
  echo " machine ${GITEA_URL#https://} login $GITEA_USER password \"<your Git password>\""
  echo " chmod 600 ~/.netrc"
  echo
  echo " API operations (e.g. creating repos) use a Personal Access Token stored in ~/.gitea_token"
  echo " echo \"<your_token>\" > ~/.gitea_token && chmod 600 ~/.gitea_token"
  exit 0
fi
|
||||
|
||||
# Parse arguments
# ========
# Flags may appear anywhere; everything else is collected as a positional
# argument. Exactly one positional argument (<host_group>) must remain.
POSITIONAL_ARGS=()
while [[ $# -gt 0 ]]; do
  case "$1" in
    --private)
      PRIVATE=true
      shift
      ;;
    --debug)
      DEBUG=true
      shift
      ;;
    --message)
      # Without this guard "shift 2" fails when --message is the last word,
      # nothing is shifted, and the loop never terminates.
      if [[ $# -lt 2 ]]; then
        log ERROR "--message requires a value"
        echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
        exit 1
      fi
      COMMIT_MESSAGE="$2"
      shift 2
      ;;
    *)
      POSITIONAL_ARGS+=("$1")
      shift
      ;;
  esac
done

set -- "${POSITIONAL_ARGS[@]}"

if [[ $# -ne 1 ]]; then
  echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
  exit 1
fi
|
||||
|
||||
# Lowercase stdin and drop every character outside [a-z0-9-]
_slugify() {
  tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-'
}

HOST_GROUP=$(echo "$1" | _slugify)
HOST_NAME=$(hostname -s | _slugify)

# The repository is named <host>-<group>-<current directory name>
REPO_PATH="$PWD"
FOLDER_NAME="${HOST_NAME}-${HOST_GROUP}-$(basename "$REPO_PATH")"
REMOTE_PATH="$FOLDER_NAME"
GIT_REMOTE="${GITEA_URL}/${GITEA_USER}/${FOLDER_NAME}.git"

# Git authentication hint
log DEBUG "Ensure ~/.netrc has: machine <host> login $GITEA_USER password <your Git password>"
|
||||
|
||||
# Check or create remote repo
# Sets HTTP_STATUS to 200 when the remote repository is usable, so the later
# remote-setup and push steps know whether to run.
check_or_create_repo() {
  if [ -z "$GITEA_API_TOKEN" ]; then
    log WARNING "GITEA_API_TOKEN is not set. Skipping API repo creation."
    # The API cannot be queried without a token, so probe the remote over Git
    # instead. Prompting is disabled so a missing credential fails fast.
    if GIT_TERMINAL_PROMPT=0 git ls-remote "$GIT_REMOTE" >/dev/null 2>&1; then
      HTTP_STATUS=200
    else
      HTTP_STATUS=404
    fi
    return
  fi

  HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
    -H "Authorization: token $GITEA_API_TOKEN" \
    "$GITEA_API_URL/repos/$GITEA_USER/$FOLDER_NAME")

  case "$HTTP_STATUS" in
    200)
      log INFO "Remote repository already exists."
      ;;
    404)
      create_repo
      ;;
    *)
      # Authentication or network problems are reported as such instead of
      # being mistaken for a missing repository.
      log ERROR "Unexpected HTTP status $HTTP_STATUS from $GITEA_API_URL/repos/$GITEA_USER/$FOLDER_NAME"
      exit 1
      ;;
  esac
}
|
||||
|
||||
|
||||
# Main Process
# ========

# Safety check against pushing from / or $HOME.
# Runs before the remote is touched so a refused run never creates a repository.
if [[ "$PWD" == "$HOME" || "$PWD" == "/" ]]; then
  log ERROR "Refusing to run inside \$PWD=$PWD"
  exit 1
fi

check_or_create_repo

log INFO "Pushing $REPO_PATH to $GIT_REMOTE"
cd "$REPO_PATH" || { log ERROR "Directory $REPO_PATH not found"; exit 1; }
|
||||
|
||||
# Initialize git if needed
# Branch is fixed to 'main' for simplicity and consistency
if [ -d .git ]; then
  log DEBUG ".git directory already present"
else
  log INFO "Initializing Git repo"
  git init
  git config init.defaultBranch main
  git checkout -b main
fi
|
||||
|
||||
# Ensure at least one commit exists
prepare_commit

# Set or update the remote, then push, only when the repository check reported 200.
# The status is compared as a string so an unset or non-numeric HTTP_STATUS
# simply takes the "skip" branch instead of going through a numeric test.
if [[ "$HTTP_STATUS" == 200 ]]; then
  setup_remote
  push_changes
else
  log WARNING "Skipping remote setup – repository does not exist."
  log WARNING "Skipping push – repository does not exist."
fi
|
||||
|
38
import_embeddings.py
Normal file
38
import_embeddings.py
Normal file
@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env python3
|
||||
# Script Version: 0.2
|
||||
# Description: Import existing embeddings.json into ChromaDB using the new client API
|
||||
|
||||
import os
|
||||
import json
|
||||
from chromadb import PersistentClient
|
||||
|
||||
# Paths and names used by the import
CONTENT_DIR = "content"
EMBEDDING_FILE = "embeddings.json"
CHROMA_DIR = "chromadb"
COLLECTION_NAME = "cds_docs"

# New Chroma client (post-migration), rooted at CHROMA_DIR
client = PersistentClient(path=CHROMA_DIR)
collection = client.get_or_create_collection(name=COLLECTION_NAME)

# Load existing embeddings
with open(EMBEDDING_FILE, "r") as embedding_file:
    embeddings_data = json.load(embedding_file)
|
||||
|
||||
# Ingest each document
# Every entry of embeddings_data is stored with its source text, taken from
# CONTENT_DIR, under an ID equal to the filename.
imported = 0
for filename, vector in embeddings_data.items():
    filepath = os.path.join(CONTENT_DIR, filename)
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read().strip()
    except FileNotFoundError:
        print(f"[WARN] Skipping missing file: {filepath}")
        continue

    # upsert instead of add: re-running the import refreshes entries whose ID
    # already exists in the collection rather than leaving stale data behind.
    collection.upsert(
        documents=[text],
        metadatas=[{"filename": filename}],
        ids=[filename],
        embeddings=[vector],
    )
    imported += 1

print(f"[INFO] Imported {imported} of {len(embeddings_data)} documents")
if imported:
    print("✅ Embeddings successfully imported into Chroma")
else:
    print("[WARN] No documents were imported")
|
||||
|
79
semantic_search.py
Normal file
79
semantic_search.py
Normal file
@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env python3
|
||||
# Script Version: 0.4
|
||||
# Description: Semantic search over local embeddings.json with content preview and optional file copy
|
||||
|
||||
import json
|
||||
import torch
|
||||
import os
|
||||
import shutil
|
||||
import numpy as np
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
# Config
MODEL_NAME = "all-mpnet-base-v2"
EMBEDDING_FILE = "embeddings.json"
CONTENT_DIR = "content"
RESULTS_DIR = "results"
PREVIEW_LINES = 5  # Number of lines to preview from the matching .txt files

# Ensure results directory exists
os.makedirs(RESULTS_DIR, exist_ok=True)

# Load model and move it to CUDA when torch reports a GPU
model = SentenceTransformer(MODEL_NAME)
gpu_available = torch.cuda.is_available()
if gpu_available:
    model = model.to("cuda")
    print("[INFO] Running on GPU")

# Load stored embeddings
with open(EMBEDDING_FILE, "r") as embedding_file:
    stored_embeddings = json.load(embedding_file)
|
||||
|
||||
# Prompt user
query = input("\U0001F50D Enter your search query: ").strip()

# Embed query
query_embedding = model.encode(query)

# Compute cosine similarities
# All stored vectors are scored in a single call instead of one call per file.
filenames = list(stored_embeddings)
if filenames:
    stored_matrix = np.asarray([stored_embeddings[name] for name in filenames])
    scores = cosine_similarity([query_embedding], stored_matrix)[0]
    results = list(zip(filenames, scores))
else:
    # Nothing to rank; an empty matrix cannot be passed to cosine_similarity.
    results = []

# Sort and display top result(s)
results.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
# Names of the files the user chose to copy into RESULTS_DIR
copied_files = []

print("\n\U0001F4C2 Top matches:")
for fname, score in results[:3]:
    print(f"\n{fname} → score: {score:.4f}")
    txt_path = os.path.join(CONTENT_DIR, fname)

    # Without the source file there is nothing to preview or copy
    if not os.path.exists(txt_path):
        print("[WARN] Source file not found for preview.")
        continue

    print("Preview:")
    with open(txt_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            print(" " + line.strip())
            if line_no >= PREVIEW_LINES:
                break

    # Ask user if they want to copy the file
    answer = input(f"📄 Copy '{fname}' to '{RESULTS_DIR}'? [y/N]: ")
    if answer.strip().lower() != "y":
        continue

    dest_path = os.path.join(RESULTS_DIR, fname)
    shutil.copyfile(txt_path, dest_path)
    copied_files.append(fname)
    print(f"[INFO] File copied to {dest_path}")

# Final summary
if not copied_files:
    print("\nℹ️ No files were copied.")
else:
    print("\n✅ Summary of copied files:")
    for copied in copied_files:
        print(f" - {copied}")
|
||||
|
Reference in New Issue
Block a user