#!/bin/bash
# Script Version: 01.8
# Description: Scrapes and extracts page text from MediaWiki pages, cleans image artifacts, and deletes empty results

# Constants
DOWNLOAD_TIMEOUT=10
TEXT_FILE_SUFFIX=".txt"
LINK_FILE_SUFFIX=".txt"

# Function to convert relative URLs to absolute URLs
resolve_url() {
    local base_url=$1
    local relative_url=$2
    if [[ "$relative_url" =~ ^https?:// ]]; then
        echo "$relative_url"
    elif [[ "$relative_url" =~ ^/ ]]; then
        echo "${base_url}${relative_url}"
    else
        echo "${base_url}/${relative_url}"
    fi
}

# Function to check if a URL should be skipped
should_skip_url() {
    local url=$1
    case "$url" in
        *"load.php"*|*"IE9fixes.css"*|*"favicon.ico"*|*"opensearch_desc.php"*|*"api.php?action="*|*"Special:RecentChanges"*|*"Special:UserLogin"*|*"Special:RequestAccount"*|*"Dioxipedia:Privacy_policy"*|*"javascript:print();"*|*"mediawiki.org"*)
            return 0 ;;  # true, should skip
        *)
            return 1 ;;  # false, don't skip
    esac
}

# Function to download content into a variable with timeout and error checking
download_content() {
    local url=$1
    local exclude_file=$2

    if should_skip_url "$url"; then
        echo "Skipping known irrelevant URL: $url"
        return 1
    fi

    if [ -f "$exclude_file" ] && grep -Fx "$url" "$exclude_file" > /dev/null; then
        echo "Skipping excluded URL: $url"
        return 1
    fi

    echo "Downloading: $url"
    SITECACHE=$(wget -T "$DOWNLOAD_TIMEOUT" -q -O - "$url" 2>/dev/null)
    if [ $? -ne 0 ] || [ -z "$SITECACHE" ]; then
        echo -e "\033[31m[ ERROR ]:\033[0m Failed to download $url" >&2
        echo "$url" >> "$exclude_file"
        return 1
    fi

    # Sanity check that the response looks like an HTML page
    if ! echo "$SITECACHE" | grep -qi "<html"; then
        echo "WARNING: $url did not return an HTML page" >&2
        echo "$url" >> "$exclude_file"
        return 1
    fi

    sleep 1
    echo "Successfully downloaded: $url"
    return 0
}

# Improved extraction function using pup and lynx
extract_text() {
    local output_file=$1
    local url=$2
    local exclude_file=$3

    echo "Extracting text from SITECACHE to $output_file"
    EXTRACTED=$(echo "$SITECACHE" | pup '#mw-content-text' 2>/dev/null)
    if [ -z "$(echo "$EXTRACTED" | sed '/^\s*$/d')" ]; then
        echo "INFO: Content empty with #mw-content-text, trying #bodyContent"
        EXTRACTED=$(echo "$SITECACHE" | pup '#bodyContent' 2>/dev/null)
    fi
    if [ -z "$(echo "$EXTRACTED" | sed '/^\s*$/d')" ]; then
        echo "WARNING: Still no content after both selectors."
        echo "$url" >> "$exclude_file"
        return 1
    fi

    echo "$EXTRACTED" | lynx -stdin -dump -nolist > "$output_file"
    if [ ! -s "$output_file" ]; then
        echo "WARNING: No text extracted from $url after lynx"
        echo "$url" >> "$exclude_file"
        rm -f "$output_file"
        return 1
    fi

    # Remove lines containing image artifacts like [something.jpg] or [something.png]
    sed -i '/\[.*\(jpg\|jpeg\|png\).*]/Id' "$output_file"

    # Delete the result if it is smaller than 100 bytes
    if [ "$(stat -c%s "$output_file")" -lt 100 ]; then
        echo "INFO: Deleted $output_file (under 100 bytes)"
        rm -f "$output_file"
        echo "$url" >> "$exclude_file"
        return 1
    fi

    echo "Successfully extracted text to $output_file"
    return 0
}

# Function to extract the page title and turn it into a filename-safe string
extract_title() {
    echo "$SITECACHE" |
        grep -oP '(?<=<title>).*(?=</title>)' |
        head -n 1 |
        sed 's/ - dioxipedia$//' |
        sed 's/[^a-zA-Z0-9-]/_/g' |
        sed 's/__*/_/g' |
        sed 's/^_//;s/_$//'
}
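# Illustrative example (hypothetical page): a <title> of "Main Page - dioxipedia"
# comes out of the pipeline above as the filename-safe string "Main_Page".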
-s "$output_file" ]; then echo "WARNING: No links extracted" rm -f "$output_file" return 1 fi echo "Successfully extracted links to $output_file" return 0 } # Main script logic if [ $# -ne 1 ]; then echo "Usage: $0 " >&2 exit 1 fi INITIAL_URL=$1 DOMAIN=$(echo "$INITIAL_URL" | awk -F[/:] '{print $4}') BASE_URL="https://$DOMAIN" ALL_PAGES_URL="$BASE_URL/index.php?title=Special:AllPages" LINKSFILE="$DOMAIN/links$LINK_FILE_SUFFIX" EXCLUDE_FILE="$DOMAIN/exclude.txt" CONTENT_DIR="$DOMAIN/content" mkdir -p "$DOMAIN" mkdir -p "$CONTENT_DIR" # Step 1: Collect links if ! download_content "$ALL_PAGES_URL" "$EXCLUDE_FILE"; then echo "Failed to download $ALL_PAGES_URL" exit 1 fi if ! extract_links "$LINKSFILE"; then echo "Failed to extract links" exit 1 fi # Step 2: Process links PROCESSED_URLS=() while IFS= read -r link; do URL=$(resolve_url "$BASE_URL" "$link") if [[ " ${PROCESSED_URLS[*]} " =~ " $URL " ]]; then echo "Skipping processed URL: $URL" continue fi if ! download_content "$URL" "$EXCLUDE_FILE"; then PROCESSED_URLS+=("$URL") continue fi PAGENAME=$(extract_title) [ -z "$PAGENAME" ] && PAGENAME="page" TEXTFILE="$CONTENT_DIR/$PAGENAME$TEXT_FILE_SUFFIX" if ! extract_text "$TEXTFILE" "$URL" "$EXCLUDE_FILE"; then PROCESSED_URLS+=("$URL") continue fi PROCESSED_URLS+=("$URL") done < "$LINKSFILE" echo "Processing complete." exit 0