#!/bin/bash

# Script Version: 01.8

# Description: Scrapes and extracts page text from MediaWiki pages, cleans image artifacts, and deletes empty results

# Constants
DOWNLOAD_TIMEOUT=10
TEXT_FILE_SUFFIX=".txt"
LINK_FILE_SUFFIX=".txt"

# Function to convert relative URLs to absolute URLs
resolve_url() {
    local base_url=$1
    local relative_url=$2

    if [[ "$relative_url" =~ ^https?:// ]]; then
        echo "$relative_url"
    elif [[ "$relative_url" =~ ^// ]]; then
        # Protocol-relative link: prepend the scheme (BASE_URL below is always https)
        echo "https:${relative_url}"
    elif [[ "$relative_url" =~ ^/ ]]; then
        echo "${base_url}${relative_url}"
    else
        echo "${base_url}/${relative_url}"
    fi
}
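
# Example (hypothetical wiki host, not part of the original script):
#   resolve_url "https://wiki.example.org" "/index.php?title=Main_Page"
#   -> https://wiki.example.org/index.php?title=Main_Page
#   resolve_url "https://wiki.example.org" "index.php?title=Main_Page"
#   -> https://wiki.example.org/index.php?title=Main_Page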

# Function to check if URL should be skipped
should_skip_url() {
    local url=$1
    case "$url" in
        *"load.php"*|*"IE9fixes.css"*|*"favicon.ico"*|*"opensearch_desc.php"*|*"api.php?action="*|*"Special:RecentChanges"*|*"Special:UserLogin"*|*"Special:RequestAccount"*|*"Dioxipedia:Privacy_policy"*|*"javascript:print();"*|*"mediawiki.org"*)
            return 0 ;; # true, should skip
        *)
            return 1 ;; # false, don't skip
    esac
}
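
# Example (hypothetical URL): resource and maintenance pages are filtered out.
#   should_skip_url "https://wiki.example.org/load.php?modules=site" && echo "skipped"
#   -> prints "skipped"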

# Function to download content into a variable with timeout and error checking
download_content() {
    local url=$1
    local exclude_file=$2

    if should_skip_url "$url"; then
        echo "Skipping known irrelevant URL: $url"
        return 1
    fi

    if [ -f "$exclude_file" ] && grep -Fx "$url" "$exclude_file" > /dev/null; then
        echo "Skipping excluded URL: $url"
        return 1
    fi

    echo "Downloading: $url"
    SITECACHE=$(wget -T "$DOWNLOAD_TIMEOUT" -q -O - "$url" 2>/dev/null)
    if [ $? -ne 0 ] || [ -z "$SITECACHE" ]; then
        echo -e "\033[31m[ ERROR ]:\033[0m Failed to download $url" >&2
        echo "$url" >> "$exclude_file"
        return 1
    fi

    if ! echo "$SITECACHE" | grep -q "<html"; then
        echo "Skipping: $url (not HTML)"
        echo "$url" >> "$exclude_file"
        return 1
    fi

    sleep 1
    echo "Successfully downloaded: $url"
    return 0
}
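
# Usage sketch (hypothetical URL and paths): on success the raw HTML is kept in
# the global SITECACHE variable, which the extract_* helpers below read.
#   download_content "https://wiki.example.org/index.php?title=Main_Page" "wiki.example.org/exclude.txt" \
#       && echo "cached ${#SITECACHE} characters"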

# Improved extraction function using pup and lynx
extract_text() {
    local output_file=$1
    local url=$2
    local exclude_file=$3

    echo "Extracting text from SITECACHE to $output_file"

    EXTRACTED=$(echo "$SITECACHE" | pup '#mw-content-text' 2>/dev/null)

    if [ -z "$(echo "$EXTRACTED" | sed '/^\s*$/d')" ]; then
        echo "INFO: Content empty with #mw-content-text, trying #bodyContent"
        EXTRACTED=$(echo "$SITECACHE" | pup '#bodyContent' 2>/dev/null)
    fi

    if [ -z "$(echo "$EXTRACTED" | sed '/^\s*$/d')" ]; then
        echo "WARNING: Still no content after both selectors."
        echo "$url" >> "$exclude_file"
        return 1
    fi

    echo "$EXTRACTED" | lynx -stdin -dump -nolist > "$output_file"

    if [ ! -s "$output_file" ]; then
        echo "WARNING: No text extracted from $url after lynx"
        echo "$url" >> "$exclude_file"
        rm -f "$output_file"
        return 1
    fi

    # Remove lines containing image artifacts like [something.jpg] or [something.png]
    sed -i '/\[.*\(jpg\|jpeg\|png\).*]/Id' "$output_file"

    # Delete if file is smaller than 100 bytes
    if [ "$(stat -c%s "$output_file")" -lt 100 ]; then
        echo "INFO: Deleted $output_file (under 100 bytes)"
        rm -f "$output_file"
        echo "$url" >> "$exclude_file"
        return 1
    fi

    echo "Successfully extracted text to $output_file"
    return 0
}
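
# Usage sketch (hypothetical paths): expects SITECACHE to have been filled by
# download_content for the same URL; writes plain text, or cleans up and records
# the URL in the exclude file on failure.
#   extract_text "wiki.example.org/content/Main_Page.txt" \
#       "https://wiki.example.org/index.php?title=Main_Page" "wiki.example.org/exclude.txt"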

# Function to extract page title
extract_title() {
    echo "$SITECACHE" |
        grep -oP '(?<=<title>).*(?=</title>)' |
        head -n 1 |
        sed 's/ - dioxipedia$//' |
        sed 's/[^a-zA-Z0-9-]/_/g' |
        sed 's/__*/_/g' |
        sed 's/^_//;s/_$//'
}
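
# Example: a <title> of "Main Page - dioxipedia" becomes "Main_Page": the wiki
# name suffix is stripped and the remaining characters are sanitized for use as
# a filename.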

# Function to extract links
extract_links() {
    local output_file=$1

    echo "$SITECACHE" | grep -oP '(?<=href=")[^"]+' | grep -v 'translate\.goog' > "$output_file"
    if [ $? -ne 0 ] || [ ! -s "$output_file" ]; then
        echo "WARNING: No links extracted"
        rm -f "$output_file"
        return 1
    fi

    echo "Successfully extracted links to $output_file"
    return 0
}
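
# Usage sketch (hypothetical path): writes one href value per line, e.g.
#   extract_links "wiki.example.org/links.txt"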

# Main script logic
if [ $# -ne 1 ]; then
    echo "Usage: $0 <URL>" >&2
    exit 1
fi
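
# Example invocation (hypothetical wiki and script name):
#   ./scrape_wiki.sh https://wiki.example.org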

INITIAL_URL=$1
DOMAIN=$(echo "$INITIAL_URL" | awk -F'[/:]' '{print $4}')
BASE_URL="https://$DOMAIN"
ALL_PAGES_URL="$BASE_URL/index.php?title=Special:AllPages"
LINKSFILE="$DOMAIN/links$LINK_FILE_SUFFIX"
EXCLUDE_FILE="$DOMAIN/exclude.txt"
CONTENT_DIR="$DOMAIN/content"

mkdir -p "$DOMAIN"
mkdir -p "$CONTENT_DIR"
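
# Resulting layout (hypothetical domain wiki.example.org):
#   wiki.example.org/links.txt     - every href found on Special:AllPages
#   wiki.example.org/exclude.txt   - URLs that failed or produced no usable text
#   wiki.example.org/content/*.txt - extracted page text, one file per page title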

# Step 1: Collect links
if ! download_content "$ALL_PAGES_URL" "$EXCLUDE_FILE"; then
    echo "Failed to download $ALL_PAGES_URL"
    exit 1
fi

if ! extract_links "$LINKSFILE"; then
    echo "Failed to extract links"
    exit 1
fi

# Step 2: Process links
PROCESSED_URLS=()
while IFS= read -r link; do
    URL=$(resolve_url "$BASE_URL" "$link")

    if [[ " ${PROCESSED_URLS[*]} " =~ " $URL " ]]; then
        echo "Skipping processed URL: $URL"
        continue
    fi

    if ! download_content "$URL" "$EXCLUDE_FILE"; then
        PROCESSED_URLS+=("$URL")
        continue
    fi

    PAGENAME=$(extract_title)
    [ -z "$PAGENAME" ] && PAGENAME="page"

    TEXTFILE="$CONTENT_DIR/$PAGENAME$TEXT_FILE_SUFFIX"

    if ! extract_text "$TEXTFILE" "$URL" "$EXCLUDE_FILE"; then
        PROCESSED_URLS+=("$URL")
        continue
    fi

    PROCESSED_URLS+=("$URL")
done < "$LINKSFILE"

echo "Processing complete."
exit 0