commit 0a1a209dac7325f057e4621bd1203cfc753352a3
Author: bubuIT
Date:   Sun Apr 13 15:16:39 2025 +0200

    Update 2025-04-13_15:16:39

diff --git a/Fine-Tune_LoRA_GPU.py b/Fine-Tune_LoRA_GPU.py
new file mode 100644
index 0000000..7fd5257
--- /dev/null
+++ b/Fine-Tune_LoRA_GPU.py
@@ -0,0 +1,82 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
+from peft import LoraConfig, get_peft_model
+from datasets import Dataset
+import torch
+import os
+
+# Load model and tokenizer from HF
+model_name = "Qwen/Qwen2.5-1.5B"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+if tokenizer.pad_token is None:
+    # padding="max_length" below needs a pad token; fall back to EOS
+    tokenizer.pad_token = tokenizer.eos_token
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")  # GPU
+
+# Prepare dataset: each .txt file as one example
+content_dir = "./content"
+texts = []
+
+for txt_file in os.listdir(content_dir):
+    if txt_file.endswith(".txt"):
+        with open(os.path.join(content_dir, txt_file), "r", encoding="utf-8") as tf:
+            # Join all non-blank lines in the file into one text
+            text = " ".join(line.strip() for line in tf if line.strip())
+        if text:
+            # An empty file would become an all-padding example; skip it
+            texts.append(text)
+
+dataset = Dataset.from_dict({"text": texts})
+print(f"Dataset size: {len(dataset)}")  # Should be ~300
+
+
+def tokenize_function(examples):
+    """Tokenize a batch of texts and attach causal-LM labels.
+
+    Labels copy input_ids, except that padding positions (attention_mask == 0)
+    are set to -100 so the loss ignores them.
+    """
+    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
+    tokenized["labels"] = [
+        [token if keep else -100 for token, keep in zip(ids, mask)]
+        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
+    ]
+    return tokenized
+
+
+tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+
+# Configure LoRA
+lora_config = LoraConfig(
+    r=8,
+    lora_alpha=32,
+    target_modules=["q_proj", "v_proj"],
+    lora_dropout=0.1,
+    task_type="CAUSAL_LM",
+)
+model = get_peft_model(model, lora_config)
+
+# Training arguments
+training_args = TrainingArguments(
+    output_dir="./fine_tuned_qwen2_5_1_5b",
+    per_device_train_batch_size=8,
+    gradient_accumulation_steps=1,
+    num_train_epochs=5,
+    learning_rate=2e-4,
+    save_steps=50,
+    logging_steps=10,
+    fp16=True,
+)
+
+# Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_dataset,
+)
+
+# Fine-tune
+trainer.train()
+
+# Save
+model.save_pretrained("./fine_tuned_qwen2_5_1_5b")
+tokenizer.save_pretrained("./fine_tuned_qwen2_5_1_5b")
diff --git a/f2b_status.sh b/f2b_status.sh
new file mode 100755
index 0000000..b902808
--- /dev/null
+++ b/f2b_status.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# =============================================================================
+# Script Name: f2b_status.sh
+# Version: 1.6
+# Description: This script retrieves and displays the status of all Fail2Ban
+#              jails, including error handling and logging.
+# =============================================================================
+
+# Log file path
+LOG_FILE="/var/log/fail2ban-status.log"
+
+# Log a message with a timestamp to stdout and append it to LOG_FILE
+log_message() {
+    echo "$(date +"%Y-%m-%d %H:%M:%S") - $1" | tee -a "$LOG_FILE"
+}
+
+# Print the jail names from "fail2ban-client status" as one space-separated line
+get_jail_list() {
+    fail2ban-client status | grep 'Jail list:' | cut -d ":" -f2 | tr -d ',' | xargs
+}
+
+# Retrieve the list of jails
+log_message "Retrieving the list of Fail2Ban jails..."
+JAIL_LIST=$(get_jail_list)
+
+# Check if any jails were found
+if [ -z "$JAIL_LIST" ]; then
+    log_message "No jails found."
+    exit 1
+fi
+
+# Convert JAIL_LIST into an array
+IFS=' ' read -r -a JAIL_ARRAY <<< "$JAIL_LIST"
+
+# Iterate over each jail and display its status
+for JAIL in "${JAIL_ARRAY[@]}"; do
+    log_message "Retrieving status for jail: $JAIL"
+
+    # A non-zero exit status counts as a failure too, not only the known error text
+    if ! STATUS=$(fail2ban-client status "$JAIL" 2>&1) || echo "$STATUS" | grep -q "Sorry but the jail"; then
+        log_message "Failed to retrieve status for jail: $JAIL. Error: $STATUS"
+    else
+        log_message "Status for jail $JAIL retrieved successfully."
+        echo "Status for jail: $JAIL"
+        echo "$STATUS"
+        echo "----------------------------"
+    fi
+done
+
+log_message "Fail2Ban status check completed."
diff --git a/gitea_push.sh b/gitea_push.sh
new file mode 100755
index 0000000..b512472
--- /dev/null
+++ b/gitea_push.sh
@@ -0,0 +1,229 @@
+#!/bin/zsh
+# Script Version: 1.4
+# Description: Pushes the current folder (e.g. /etc) to a nested Gitea repo using provided nesting arguments. Auto-creates the remote repo via Gitea API if missing.
+
+# Set variables
+# ========
+
+# Try to extract GITEA_API_TOKEN from ~/.netrc if present
+if [ -z "$GITEA_API_TOKEN" ] && grep -q '^GITEA_API_TOKEN=' ~/.netrc 2>/dev/null; then
+  GITEA_API_TOKEN=$(grep '^GITEA_API_TOKEN=' ~/.netrc | head -n1 | cut -d= -f2 | xargs)
+  export GITEA_API_TOKEN
+fi
+
+GITEA_USER=$(awk '{for(i=1;i<=NF;i++) if($i=="login") print $(i+1)}' ~/.netrc | head -n1)
+if [ -z "$GITEA_USER" ]; then
+  echo "[ERROR] No login found in ~/.netrc"
+  exit 1
+fi
+
+# Check the bare host: the URL itself is never empty because of the "https://" prefix
+GITEA_HOST=$(awk '{for(i=1;i<=NF;i++) if($i=="machine") print $(i+1)}' ~/.netrc | head -n1)
+if [ -z "$GITEA_HOST" ]; then
+  echo "[ERROR] No URL found in ~/.netrc"
+  exit 1
+fi
+GITEA_URL="https://$GITEA_HOST"
+GITEA_API_URL="$GITEA_URL/api/v1"
+
+PRIVATE=false
+DEBUG=false
+COMMIT_MESSAGE="Update $(date +%F_%T)"
+# Becomes true once the remote repo is known (or assumed) to exist
+REPO_READY=false
+
+# Logging function
+# ========
+log() {
+  local level="$1"; shift
+  if [ "$level" = "DEBUG" ] && [ "$DEBUG" != true ]; then return; fi
+  # tput fails without a usable terminal (e.g. under cron); fall back to no colour
+  local color_reset="$(tput sgr0 2>/dev/null)"
+  local color=""
+  case "$level" in
+    INFO) color="$(tput setaf 2 2>/dev/null)" ;; # green
+    WARNING) color="$(tput setaf 3 2>/dev/null)" ;; # yellow
+    ERROR) color="$(tput setaf 1 2>/dev/null)" ;; # red
+    DEBUG) color="$(tput setaf 4 2>/dev/null)" ;; # blue
+  esac
+  echo "${color}[$level] $*${color_reset}"
+}
+
+# Functions
+# ========
+# Create the repository via the Gitea API; exits the script on failure
+create_repo() {
+  log INFO "Repository does not exist. Creating via API: $REMOTE_PATH"
+  log DEBUG "POST $GITEA_API_URL/user/repos with name=$REMOTE_PATH and private=$PRIVATE"
+  RESPONSE=$(curl -s -X POST \
+    -H "Authorization: token $GITEA_API_TOKEN" \
+    -H "Content-Type: application/json" \
+    -d "{\"name\": \"$FOLDER_NAME\", \"private\": $PRIVATE}" \
+    "$GITEA_API_URL/user/repos")
+
+  if echo "$RESPONSE" | grep -q '"clone_url"'; then
+    log INFO "Remote repository created successfully."
+    # The repo exists now, so remote setup and push must not be skipped
+    REPO_READY=true
+  else
+    log ERROR "Failed to create remote repository: $RESPONSE"
+    exit 1
+  fi
+}
+
+# Stage everything and commit; "nothing to commit" is tolerated once history exists
+prepare_commit() {
+  git add .
+  if ! git rev-parse --verify HEAD >/dev/null 2>&1; then
+    log INFO "Creating initial commit"
+    git commit -m "$COMMIT_MESSAGE"
+  else
+    log INFO "Committing changes"
+    git commit -m "$COMMIT_MESSAGE" || log INFO "Nothing to commit"
+  fi
+}
+
+# Point remote "origin" at $GIT_REMOTE, adding it when missing
+setup_remote() {
+  if git remote | grep -q '^origin$'; then
+    log INFO "Updating remote origin URL"
+    git remote set-url origin "$GIT_REMOTE"
+  else
+    log INFO "Adding remote origin"
+    git remote add origin "$GIT_REMOTE"
+  fi
+}
+
+# Push branch main and set it as upstream
+push_changes() {
+  log INFO "Pushing to $GIT_REMOTE"
+  git push -u origin main
+}
+
+# Show help if no arguments are given
+# ========
+if [ $# -eq 0 ]; then
+  echo "GITEA_API_TOKEN="
+  echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
+  echo "Example: $0 server"
+  echo " $0 --private workstation"
+  echo " $0 --debug server"
+  echo " $0 --message \"minor update\" server"
+  echo
+  echo "Note: You must cd into the target folder before running this script."
+  echo "For example:"
+  echo " cd /etc && $0 server"
+  echo
+  echo "Authentication:"
+  echo " Git uses ~/.netrc for authentication. You can create it like this:"
+  echo " echo \"machine \$(echo \"$GITEA_URL\" | sed 's|https\\?://||') login $GITEA_USER password \"\"\" > ~/.netrc"
+  echo " chmod 600 ~/.netrc"
+  exit 0
+fi
+
+# Parse arguments
+# ========
+POSITIONAL_ARGS=()
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --private)
+      PRIVATE=true
+      shift
+      ;;
+    --debug)
+      DEBUG=true
+      shift
+      ;;
+    --message)
+      # "shift 2" fails when the value is missing, which would loop forever
+      if [ $# -lt 2 ]; then
+        log ERROR "--message requires a value"
+        exit 1
+      fi
+      COMMIT_MESSAGE="$2"
+      shift 2
+      ;;
+    *)
+      POSITIONAL_ARGS+=("$1")
+      shift
+      ;;
+  esac
+done
+
+set -- "${POSITIONAL_ARGS[@]}"
+
+if [ $# -ne 1 ]; then
+  echo "Usage: $0 [--private] [--debug] [--message \"your commit message\"] <host_group>"
+  exit 1
+fi
+
+# Safety check against pushing from / or $HOME.
+# Runs before any API call so no remote repo is created for a refused folder.
+if [[ "$PWD" == "$HOME" || "$PWD" == "/" ]]; then
+  log ERROR "Refusing to run inside \$PWD=$PWD"
+  exit 1
+fi
+
+HOST_GROUP=$(echo "$1" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-')
+HOST_NAME=$(hostname -s | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-')
+FOLDER_NAME="${HOST_NAME}-$(basename "$PWD")"
+REPO_PATH="$PWD"
+REMOTE_PATH="$FOLDER_NAME"
+GIT_REMOTE="$GITEA_URL/$GITEA_USER/$FOLDER_NAME.git"
+
+# Git authentication hint
+log DEBUG "Ensure ~/.netrc has: machine login $GITEA_USER password "
+
+# Check if GITEA_API_TOKEN is set
+if [ -z "$GITEA_API_TOKEN" ]; then
+  # Without a token the repo can be neither checked nor created; let git push decide
+  log WARNING "GITEA_API_TOKEN is not set. Skipping API repo creation."
+  REPO_READY=true
+else
+  # Check if remote repo exists
+  HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
+    -H "Authorization: token $GITEA_API_TOKEN" \
+    "$GITEA_API_URL/repos/$GITEA_USER/$FOLDER_NAME")
+
+  if [ "$HTTP_STATUS" = "200" ]; then
+    log INFO "Remote repository already exists."
+    REPO_READY=true
+  else
+    create_repo
+  fi
+fi
+
+# Main Process
+# ========
+
+log INFO "Pushing $REPO_PATH to $GIT_REMOTE"
+cd "$REPO_PATH" || { log ERROR "Directory $REPO_PATH not found"; exit 1; }
+
+# Initialize git if needed
+# Branch is fixed to 'main' for simplicity and consistency
+if [ ! -d .git ]; then
+  log INFO "Initializing Git repo"
+  git init
+  git config init.defaultBranch main
+  git checkout -b main
+else
+  log DEBUG ".git directory already present"
+fi
+
+# Ensure at least one commit exists
+prepare_commit
+
+# Set or update remote
+if [ "$REPO_READY" = true ]; then
+  setup_remote
+else
+  log WARNING "Skipping remote setup – repository does not exist."
+fi
+
+# Push to remote
+if [ "$REPO_READY" = true ]; then
+  push_changes
+else
+  log WARNING "Skipping push – repository does not exist."
+fi
+
diff --git a/import_embeddings.py b/import_embeddings.py
new file mode 100644
index 0000000..a96689b
--- /dev/null
+++ b/import_embeddings.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+# Script Version: 0.6
+# Description: Import existing embeddings.json into Open WebUI's ChromaDB instance using the new client API
+
+import os
+import json
+from chromadb import PersistentClient
+
+# Use Open WebUI's active Chroma DB directory
+CHROMA_DIR = "/srv/open-webui/backend/data/vector_db"
+COLLECTION_NAME = "cds_docs"
+EMBEDDING_FILE = "embeddings.json"
+CONTENT_DIR = "content"
+
+# Stop Open WebUI before running this script to avoid file lock issues
+client = PersistentClient(path=CHROMA_DIR)
+collection = client.get_or_create_collection(name=COLLECTION_NAME)
+
+# Load existing embeddings (iterated below as {filename: embedding vector})
+with open(EMBEDDING_FILE, "r", encoding="utf-8") as embedding_file:
+    embeddings_data = json.load(embedding_file)
+
+imported_count = 0
+
+# Ingest each document
+for filename, vector in embeddings_data.items():
+    filepath = os.path.join(CONTENT_DIR, filename)
+    # Guard only the file read, so the handler covers nothing but a missing content file
+    try:
+        with open(filepath, "r", encoding="utf-8") as content_file:
+            text = content_file.read().strip()
+    except FileNotFoundError:
+        print(f"[WARN] Skipping missing file: {filepath}")
+        continue
+
+    collection.add(
+        documents=[text],
+        metadatas=[{"filename": filename}],
+        ids=[filename],
+        embeddings=[vector],
+    )
+    imported_count += 1
+
+print(f"✅ Embeddings successfully imported into Chroma: {imported_count} documents")
+
diff --git a/lxc_create_container.sh b/lxc_create_container.sh
new file mode 100755
index 0000000..09551c9
--- /dev/null
+++ b/lxc_create_container.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+# =============================================================================
+# Script Name: lxc_create_container.sh
+# Version: 1.1
+# Description: This script creates a new LXC container from a template, assigns
+#              a unique MAC address, updates the hostname and /etc/hosts file,
+#              and verifies internet access.
+# =============================================================================
+
+# Prompt for the new container hostname
+read -e -p "LXCHOSTNAME: " LXCHOSTNAME
+
+# Reject empty or unsafe names: the value is embedded in shell commands run
+# inside the container and used as a directory name under /var/lib/lxc.
+if [[ ! "$LXCHOSTNAME" =~ ^[A-Za-z0-9]([A-Za-z0-9-]*[A-Za-z0-9])?$ ]]; then
+    echo "Invalid hostname: '$LXCHOSTNAME' (allowed: letters, digits and '-')." >&2
+    exit 1
+fi
+export LXCHOSTNAME
+
+# Check if the template container is running and stop it if necessary
+if lxc-info -n template | grep -q 'RUNNING'; then
+    echo "Stopping the template container..."
+    if ! lxc-stop -n template; then
+        echo "Failed to stop the template container."
+        exit 1
+    fi
+else
+    echo "Template container is not running."
+fi
+
+# Copy the template to create a new container with the given hostname
+echo "Creating a new container with hostname: $LXCHOSTNAME..."
+if ! lxc-copy -n template -N "$LXCHOSTNAME"; then
+    echo "Failed to copy the template container."
+    exit 1
+fi
+
+# Function to generate a unique MAC address
+# Prints a random 00:16:3e:xx:xx:xx address that no /var/lib/lxc/*/config uses yet
+generate_unique_hwaddr() {
+    local hwaddr
+    local existing_hwaddrs
+    while : ; do
+        hwaddr=$(printf '00:16:3e:%02x:%02x:%02x\n' $((RANDOM%256)) $((RANDOM%256)) $((RANDOM%256)))
+        existing_hwaddrs=$(grep "lxc.net.0.hwaddr" /var/lib/lxc/*/config | grep "$hwaddr")
+        if [ -z "$existing_hwaddrs" ]; then
+            # MAC address is unique
+            echo "$hwaddr"
+            return
+        fi
+    done
+}
+
+# Generate a unique MAC address
+NEW_HWADDR=$(generate_unique_hwaddr)
+
+# Path to the LXC configuration file
+CONFIG_FILE="/var/lib/lxc/$LXCHOSTNAME/config"
+
+# Replace the existing hwaddr line, or append one when the template has none
+# (sed's "c" command alone would silently change nothing in that case)
+echo "Updating MAC address in $CONFIG_FILE to $NEW_HWADDR..."
+if grep -q '^lxc\.net\.0\.hwaddr' "$CONFIG_FILE"; then
+    if ! sed -i "/^lxc.net.0.hwaddr/c\lxc.net.0.hwaddr = $NEW_HWADDR" "$CONFIG_FILE"; then
+        echo "Failed to update MAC address in $CONFIG_FILE."
+        exit 1
+    fi
+elif ! echo "lxc.net.0.hwaddr = $NEW_HWADDR" >> "$CONFIG_FILE"; then
+    echo "Failed to update MAC address in $CONFIG_FILE."
+    exit 1
+fi
+
+# Start the new container
+echo "Starting the new container..."
+if ! lxc-start -n "$LXCHOSTNAME"; then
+    echo "Failed to start the container $LXCHOSTNAME."
+    exit 1
+fi
+
+# Wait for the container to start
+sleep 5
+
+# Change the hostname inside the container
+echo "Changing the hostname inside the container..."
+if ! lxc-attach -n "$LXCHOSTNAME" -- bash -c "echo '$LXCHOSTNAME' > /etc/hostname" || \
+   ! lxc-attach -n "$LXCHOSTNAME" -- hostname "$LXCHOSTNAME"; then
+    echo "Failed to set the hostname inside the container."
+    exit 1
+fi
+
+# Update /etc/hosts
+echo "Updating /etc/hosts inside the container..."
+if ! lxc-attach -n "$LXCHOSTNAME" -- bash -c "echo '127.0.0.1 $LXCHOSTNAME' >> /etc/hosts"; then
+    echo "Failed to update /etc/hosts inside the container."
+    exit 1
+fi
+
+# Ensure the container has internet access (optional)
+echo "Checking internet connectivity inside the container..."
+if ! lxc-attach -n "$LXCHOSTNAME" -- ping -c 4 google.com; then
+    echo "Container $LXCHOSTNAME does not have internet access."
+fi
+echo
+
+# Stop and restart the container
+echo "Restarting the container..."
+if ! lxc-stop -n "$LXCHOSTNAME" || ! lxc-start -n "$LXCHOSTNAME"; then
+    echo "Failed to restart the container $LXCHOSTNAME."
+    exit 1
+fi
+
+# Display the MAC addresses to verify the changes
+echo "Displaying the MAC addresses to verify the changes..."
+grep lxc.net.0.hwaddr /var/lib/lxc/*/config
+
+# Wait and list containers to ensure they are running
+sleep 9
+echo "Listing all containers..."
+lxc-ls -f
+
+echo "LXC container setup completed successfully."
diff --git a/lxc_list_login.sh b/lxc_list_login.sh
new file mode 100755
index 0000000..2ee3d28
--- /dev/null
+++ b/lxc_list_login.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# =============================================================================
+# Script Name: lxc_list_login.sh
+# Version: 03
+# Description: Lists LXC containers, checks their statuses, and allows login.
+# =============================================================================
+
+# Commands this script calls; all must be on PATH
+REQUIRED_CMDS=("lxc-ls" "lxc-info" "lxc-start" "lxc-attach")
+
+# Abort with exit status 1 on the first required command that is missing
+for CMD in "${REQUIRED_CMDS[@]}"; do
+    if ! command -v "$CMD" &> /dev/null; then
+        echo "The command $CMD is not installed. Please install it and try again."
+        exit 1
+    fi
+done
+
+# Collect container names: first column of every "lxc-ls -f" row after the header
+echo "List of all LXC containers:"
+CONTAINERS=($(lxc-ls -f | awk 'NR>1 && $1 != "" {print $1}'))
+
+# Exit with status 1 when no container was found
+if [[ ${#CONTAINERS[@]} -eq 0 ]]; then
+    echo "There are no LXC containers."
+    exit 1
+fi
+
+# Print an indexed table of containers with the state reported by lxc-info
+printf "\n%-5s %-20s %-10s\n" "Index" "Container Name" "Status"
+echo "------------------------------------------"
+for (( I=0; I<${#CONTAINERS[@]}; I++ )); do
+    LXCHOSTNAME="${CONTAINERS[$I]}"
+    if [[ -n "$LXCHOSTNAME" ]]; then
+        STATUS=$(lxc-info --name="$LXCHOSTNAME" | grep "State" | awk '{print $2}')  # second field of the "State" line
+        printf "%-5d %-20s %-10s\n" "$I" "$LXCHOSTNAME" "$STATUS"
+    fi
+done
+
+# Prompt user to select a container by the index shown in the table
+read -p "Select a container to log in (0-$(( ${#CONTAINERS[@]} - 1 ))): " SELECTION
+
+# Validate selection: digits only and within the bounds of CONTAINERS
+if [[ $SELECTION =~ ^[0-9]+$ ]] && [[ $SELECTION -ge 0 && $SELECTION -lt ${#CONTAINERS[@]} ]]; then
+    LXCHOSTNAME="${CONTAINERS[$SELECTION]}"
+    STATUS=$(lxc-info --name="$LXCHOSTNAME" | grep "State" | awk '{print $2}')
+
+    if [[ $STATUS == "STOPPED" ]]; then
+        read -p "Container $LXCHOSTNAME is stopped. Do you want to start it? (y/n) " START_SELECTION
+        if [[ $START_SELECTION == "y" ]]; then  # only a lowercase "y" counts as yes
+            echo "Starting the container $LXCHOSTNAME..."
+            if lxc-start --name="$LXCHOSTNAME"; then
+                echo "Container $LXCHOSTNAME has been started."
+                for i in {1..10}; do  # poll the state up to 10 times, one second apart
+                    STATUS=$(lxc-info --name="$LXCHOSTNAME" | grep "State" | awk '{print $2}')
+                    if [[ $STATUS == "RUNNING" ]]; then
+                        break
+                    fi
+                    sleep 1
+                done
+                if [[ $STATUS != "RUNNING" ]]; then
+                    echo "Container $LXCHOSTNAME failed to start within the timeout period."
+                    exit 1
+                fi
+            else
+                echo "Error starting the container $LXCHOSTNAME."
+                exit 1
+            fi
+        else
+            echo "Container $LXCHOSTNAME was not started."
+            exit 1
+        fi
+    fi
+    echo "Logging into the container $LXCHOSTNAME..."
+    if ! lxc-attach --name="$LXCHOSTNAME"; then  # the script's exit status is 1 if the attached session fails
+        echo "Error logging into the container $LXCHOSTNAME."
+        exit 1
+    fi
+else
+    echo "Invalid selection. Please run the script again and choose a valid number."
+    exit 1
+fi
diff --git a/raw_training_data.py b/raw_training_data.py
new file mode 100644
index 0000000..87c2e56
--- /dev/null
+++ b/raw_training_data.py
@@ -0,0 +1,11 @@
+import os
+
+content_dir = "./content"  # directory scanned for .txt files
+output_file = "raw_training_data.txt"  # overwritten on every run (opened with "w")
+
+with open(output_file, "w", encoding="utf-8") as f:
+    for txt_file in os.listdir(content_dir):
+        if txt_file.endswith(".txt"):
+            with open(os.path.join(content_dir, txt_file), "r", encoding="utf-8") as tf:
+                text = tf.read().strip()  # trims only leading/trailing whitespace
+                f.write(text + "\n")  # One write per file; NOTE(review): inner newlines are kept, so one file can span several output lines
diff --git a/scraper.sh b/scraper.sh
new file mode 100755
index 0000000..c9bd32e
--- /dev/null
+++ b/scraper.sh
@@ -0,0 +1,189 @@
+#!/bin/bash
+# Script Version: 01.8
+# Description: Scrapes and extracts page text from MediaWiki pages, cleans image artifacts, and deletes empty results
+
+# Constants
+DOWNLOAD_TIMEOUT=10  # passed to wget -T
+TEXT_FILE_SUFFIX=".txt"
+LINK_FILE_SUFFIX=".txt"
+
+# Function to convert relative URLs to absolute URLs ($1 = base URL, $2 = link); prints the result
+resolve_url() {
+    local base_url=$1
+    local relative_url=$2
+
+    if [[ "$relative_url" =~ ^https?:// ]]; then
+        echo "$relative_url"  # already absolute
+    elif [[ "$relative_url" =~ ^/ ]]; then
+        echo "${base_url}${relative_url}"  # root-relative
+    else
+        echo "${base_url}/${relative_url}"  # anything else is appended after a slash
+    fi
+}
+
+# Function to check if URL should be skipped; returns 0 when $1 contains any listed substring
+should_skip_url() {
+    local url=$1
+    case "$url" in
+        *"load.php"*|*"IE9fixes.css"*|*"favicon.ico"*|*"opensearch_desc.php"*|*"api.php?action="*|*"Special:RecentChanges"*|*"Special:UserLogin"*|*"Special:RequestAccount"*|*"Dioxipedia:Privacy_policy"*|*"javascript:print();"*|*"mediawiki.org"*)
+            return 0 ;; # true, should skip
+        *)
+            return 1 ;; # false, don't skip
+    esac
+}
+
+# Function to download $1 into the global SITECACHE with timeout and error checking; failed URLs are appended to the exclude file $2
+download_content() {
+    local url=$1
+    local exclude_file=$2
+
+    if should_skip_url "$url"; then
+        echo "Skipping known irrelevant URL: $url"
+        return 1
+    fi
+
+    if [ -f "$exclude_file" ] && grep -Fx "$url" "$exclude_file" > /dev/null; then  # exact whole-line match
+        echo "Skipping excluded URL: $url"
+        return 1
+    fi
+
+    echo "Downloading: $url"
+    SITECACHE=$(wget -T "$DOWNLOAD_TIMEOUT" -q -O - "$url" 2>/dev/null)
+    if [ $? -ne 0 ] || [ -z "$SITECACHE" ]; then  # $? is wget's exit status here
+        echo -e "\033[31m[ ERROR ]:\033[0m Failed to download $url" >&2
+        echo "$url" >> "$exclude_file"
+        return 1
+    fi
+    if ! echo "$SITECACHE" | grep -q "> "$exclude_file"
+        return 1  # NOTE(review): the line above looks truncated in this patch — verify against the original file
+    fi
+    sleep 1  # pause one second after each successful download
+    echo "Successfully downloaded: $url"
+    return 0
+}
+
+# Extraction function using pup and lynx: writes the text of SITECACHE to $1; on failure appends URL $2 to exclude file $3 and returns 1
+extract_text() {
+    local output_file=$1
+    local url=$2
+    local exclude_file=$3
+
+    echo "Extracting text from SITECACHE to $output_file"
+
+    EXTRACTED=$(echo "$SITECACHE" | pup '#mw-content-text' 2>/dev/null)
+
+    if [ -z "$(echo "$EXTRACTED" | sed '/^\s*$/d')" ]; then  # empty once blank lines are dropped
+        echo "INFO: Content empty with #mw-content-text, trying #bodyContent"
+        EXTRACTED=$(echo "$SITECACHE" | pup '#bodyContent' 2>/dev/null)
+    fi
+
+    if [ -z "$(echo "$EXTRACTED" | sed '/^\s*$/d')" ]; then
+        echo "WARNING: Still no content after both selectors."
+        echo "$url" >> "$exclude_file"
+        return 1
+    fi
+
+    echo "$EXTRACTED" | lynx -stdin -dump -nolist > "$output_file"
+
+    if [ ! -s "$output_file" ]; then
+        echo "WARNING: No text extracted from $url after lynx"
+        echo "$url" >> "$exclude_file"
+        rm -f "$output_file"
+        return 1
+    fi
+
+    # Remove lines containing image artifacts like [something.jpg] or [something.png] (case-insensitive)
+    sed -i '/\[.*\(jpg\|jpeg\|png\).*]/Id' "$output_file"
+
+    # Delete if file is smaller than 100 bytes, and exclude the URL from later runs
+    if [ $(stat -c%s "$output_file") -lt 100 ]; then
+        echo "INFO: Deleted $output_file (under 100 bytes)"
+        rm -f "$output_file"
+        echo "$url" >> "$exclude_file"
+        return 1
+    fi
+
+    echo "Successfully extracted text to $output_file"
+    return 0
+}
+
+# Function to extract the page title from SITECACHE as a filename-safe slug. NOTE(review): the grep look-around pattern is empty here — presumably lost in this patch; confirm against the original file
+extract_title() {
+    echo "$SITECACHE" | grep -oP '(?<=).*(?=)' | head -n 1 | sed 's/ - dioxipedia$//' | sed 's/[^a-zA-Z0-9-]/_/g' | sed 's/__*/_/g' | sed 's/^_//;s/_$//'
+}
+
+# Function to extract links: writes every href="..." value in SITECACHE (minus translate.goog ones) to $1
+extract_links() {
+    local output_file=$1
+
+    echo "$SITECACHE" | grep -oP '(?<=href=")[^"]+' | grep -v 'translate\.goog' > "$output_file"
+    if [ $? -ne 0 ] || [ ! -s "$output_file" ]; then
+        echo "WARNING: No links extracted"
+        rm -f "$output_file"
+        return 1
+    fi
+    echo "Successfully extracted links to $output_file"
+    return 0
+}
+
+# Main script logic: exactly one argument (the start URL) is required
+if [ $# -ne 1 ]; then
+    echo "Usage: $0 " >&2
+    exit 1
+fi
+
+INITIAL_URL=$1
+DOMAIN=$(echo "$INITIAL_URL" | awk -F[/:] '{print $4}')  # 4th field when splitting on "/" and ":" (the host in scheme://host/...)
+BASE_URL="https://$DOMAIN"  # always https, whatever scheme INITIAL_URL used
+ALL_PAGES_URL="$BASE_URL/index.php?title=Special:AllPages"
+LINKSFILE="$DOMAIN/links$LINK_FILE_SUFFIX"
+EXCLUDE_FILE="$DOMAIN/exclude.txt"
+CONTENT_DIR="$DOMAIN/content"
+
+mkdir -p "$DOMAIN"
+mkdir -p "$CONTENT_DIR"
+
+# Step 1: Collect links from the Special:AllPages listing
+if ! download_content "$ALL_PAGES_URL" "$EXCLUDE_FILE"; then
+    echo "Failed to download $ALL_PAGES_URL"
+    exit 1
+fi
+
+if ! extract_links "$LINKSFILE"; then
+    echo "Failed to extract links"
+    exit 1
+fi
+
+# Step 2: Process links one per line; each URL is attempted at most once per run
+PROCESSED_URLS=()
+while IFS= read -r link; do
+    URL=$(resolve_url "$BASE_URL" "$link")
+
+    if [[ " ${PROCESSED_URLS[*]} " =~ " $URL " ]]; then  # match against the space-joined array
+        echo "Skipping processed URL: $URL"
+        continue
+    fi
+
+    if ! download_content "$URL" "$EXCLUDE_FILE"; then
+        PROCESSED_URLS+=("$URL")
+        continue
+    fi
+
+    PAGENAME=$(extract_title)
+    [ -z "$PAGENAME" ] && PAGENAME="page"  # fallback name when no title was found
+
+    TEXTFILE="$CONTENT_DIR/$PAGENAME$TEXT_FILE_SUFFIX"  # pages with the same title slug share (overwrite) one file
+
+    if ! extract_text "$TEXTFILE" "$URL" "$EXCLUDE_FILE"; then
+        PROCESSED_URLS+=("$URL")
+        continue
+    fi
+
+    PROCESSED_URLS+=("$URL")
+done < "$LINKSFILE"
+
+echo "Processing complete."
+exit 0
+
diff --git a/sendmail_test.sh b/sendmail_test.sh
new file mode 100755
index 0000000..d42e08d
--- /dev/null
+++ b/sendmail_test.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Script Name: sendmail_test.sh
+# Version: 03
+# Description: This script sends a test email using sendmail. The recipient's email address is the first argument.
+#              It logs messages to the console only.
+
+# Use the first argument as recipient; default to "root" when none is given
+if [ -z "$1" ]; then
+    TO="root"
+else
+    TO="$1"
+fi
+
+# Email details
+SUBJECT="Postfix Test"
+FROM="$(whoami)@$(hostname)"  # current user at the local hostname
+BODY="This is the email body!"
+
+# Function to send email; exits with status 1 when sendmail is not on PATH. NOTE(review): the body below is cut off in this patch — verify against the original file
+send_email() {
+    if ! command -v sendmail &> /dev/null; then
+        echo "Sendmail is not installed or configured. Please ensure sendmail is installed and properly set up." >&2
+        exit 1
+    fi
+
+    sendmail -t <> "$LOG_FILE"
+}
+
+info() {  # print $1 only when INTERACTIVE is "true"
+    if [[ "$INTERACTIVE" = true ]]; then
+        echo "$1"
+    fi
+}
+
+# Emit CGI-style headers when stdin is a terminal (-t 0). NOTE(review): a terminal usually means an interactive run, so this test may be inverted — confirm
+if [ -t 0 ]; then
+    echo "Content-Type: text/plain"
+    echo ""
+fi
+
+# Ensure the token file exists
+if [ ! -f "$TOKEN_FILE" ]; then
+    log "ERROR: Token file not found."
+    exit 1
+fi
+
+# Read the token (whole file content)
+TOKEN=$(< "$TOKEN_FILE")
+if [ -z "$TOKEN" ]; then
+    log "ERROR: Token is empty."
+    exit 1
+fi
+
+# Log the token retrieval (the token value itself is not part of this message)
+log "INFO: Token retrieved for update."
+
+# Fetch the current public IP from the external service (plain HTTP)
+IP_CURL=$(curl -s http://ip.dynproxy.net)
+if [ -z "$IP_CURL" ]; then
+    log "ERROR: Failed to fetch current public IP."
+    exit 1
+fi
+
+# Ensure the IP file exists
+if [ ! -f "$IP_FILE" ]; then
+    log "INFO: IP file not found. Creating a new one with current IP."
+    echo "$IP_CURL" > "$IP_FILE"
+    log "INFO: IP file created with initial IP $IP_CURL."
+    info "Initial IP file created with IP: $IP_CURL"
+    PREVIOUS_IP="" # Set to empty to force update logic
+else
+    # Read the previous IP from the IP file
+    PREVIOUS_IP=$(< "$IP_FILE")
+fi
+
+# Compare the current IP with the previous IP; update DNS only when they differ
+if [ "$IP_CURL" != "$PREVIOUS_IP" ]; then
+    log "INFO: IP has changed from $PREVIOUS_IP to $IP_CURL. Proceeding with DNS update."
+
+    # Log the IP to be updated
+    log "INFO: Updating DNS for IP $IP_CURL."
+
+    # Post the token and IP to trigger the DNS zone update; the response body goes to /tmp/curl_output, the HTTP status code into RESPONSE
+    RESPONSE=$(curl -s -o /tmp/curl_output -w "%{http_code}" -X POST \
+        -H "Content-Type: application/x-www-form-urlencoded" \
+        --data-urlencode "token=$TOKEN" \
+        --data-urlencode "ip=$IP_CURL" \
+        $UPDATE_URL)
+
+    # Log the response and result. NOTE(review): both log lines below include $TOKEN in clear text — confirm this is acceptable for the log file
+    if [ "$RESPONSE" -eq 200 ]; then
+        log "SUCCESS: DNS zone update triggered successfully for token $TOKEN and IP $IP_CURL."
+        info "DNS zone update triggered successfully"
+        # Write the new IP to the IP file
+        echo "$IP_CURL" > "$IP_FILE"
+    else
+        log "ERROR: Failed to trigger DNS zone update for token $TOKEN and IP $IP_CURL. Response code: $RESPONSE. Response body: $(cat /tmp/curl_output)"
+        info "Failed to trigger DNS zone update. HTTP response: $RESPONSE"
+        exit 1
+    fi
+else
+    log "INFO: IP has not changed. No update needed."
+    info "IP has not changed. No update needed."
+fi
+
+# Cleanup temporary files (not reached on the failure path above, which exits first)
+rm -f /tmp/curl_output
+
diff --git a/wrap_embeddings.sh b/wrap_embeddings.sh
new file mode 100755
index 0000000..a905c93
--- /dev/null
+++ b/wrap_embeddings.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Script Version: 0.3
+# Description: Convert each .txt in content/ to .json with embedding in json/
+
+# Set variables
+CONTENT_DIR="./content"
+JSON_DIR="./json"
+EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"
+
+# Check dependencies: the sentence_transformers module must be importable by python3
+if ! python3 -c "import sentence_transformers" 2>/dev/null; then
+    echo "[ERROR] ❌ sentence-transformers not installed. Run: pip3 install sentence-transformers"
+    exit 1
+fi
+
+# Create the output directory, then check that at least one input .txt file exists
+mkdir -p "$JSON_DIR"
+if [ ! -d "$CONTENT_DIR" ] || ! ls "$CONTENT_DIR"/*.txt >/dev/null 2>&1; then
+    echo "[ERROR] ❌ No .txt files found in $CONTENT_DIR"
+    exit 1
+fi
+
+# Generate embeddings: one JSON file per .txt with keys id, text and embedding; the model name is substituted by the shell, the directories are passed as arguments
+python3 -c "
+import sys, json, os
+from sentence_transformers import SentenceTransformer
+content_dir, json_dir = sys.argv[1], sys.argv[2]
+model = SentenceTransformer('${EMBEDDING_MODEL}')
+for txt_file in os.listdir(content_dir):
+    if txt_file.endswith('.txt'):
+        base_name = txt_file[:-4]
+        try:
+            with open(os.path.join(content_dir, txt_file), 'r', encoding='utf-8') as f:
+                text = f.read()
+            embedding = model.encode([text])[0].tolist()
+            with open(os.path.join(json_dir, f'{base_name}.json'), 'w') as f:
+                json.dump({'id': base_name, 'text': text, 'embedding': embedding}, f)
+            print(f'[DEBUG] ✅ Saved: {json_dir}/{base_name}.json')
+        except Exception as e:
+            print(f'[ERROR] ❌ Failed: {txt_file} - {str(e)}', file=sys.stderr)
+" "$CONTENT_DIR" "$JSON_DIR" 2>&1 | while read -r line; do echo "$line"; done  # stderr is merged into stdout and echoed line by line
+
+echo "✅ All .txt files converted to JSON with embeddings in $JSON_DIR"  # printed unconditionally, even when some files failed above