#!/bin/bash
# Script Version: 01.8
# Description: Scrapes and extracts page text from MediaWiki pages, cleans image artifacts, and deletes empty results
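#
# Usage (illustrative; the script name and wiki URL are placeholders):
#   ./wiki_scrape.sh "https://wiki.example.org/index.php?title=Main_Page"
# The script derives the domain from the given URL, collects links from that
# wiki's Special:AllPages, and writes one cleaned text file per page under
# <domain>/content/.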
# Constants
DOWNLOAD_TIMEOUT=10
TEXT_FILE_SUFFIX=".txt"
LINK_FILE_SUFFIX=".txt"
# Function to convert relative URLs to absolute URLs
resolve_url() {
    local base_url=$1
    local relative_url=$2
    if [[ "$relative_url" =~ ^https?:// ]]; then
        echo "$relative_url"
    elif [[ "$relative_url" =~ ^/ ]]; then
        echo "${base_url}${relative_url}"
    else
        echo "${base_url}/${relative_url}"
    fi
}
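# Illustrative examples (hypothetical values), covering the three branches above:
#   resolve_url "https://wiki.example.org" "https://other.example/page"  -> https://other.example/page
#   resolve_url "https://wiki.example.org" "/index.php?title=Main_Page"  -> https://wiki.example.org/index.php?title=Main_Page
#   resolve_url "https://wiki.example.org" "index.php?title=Main_Page"   -> https://wiki.example.org/index.php?title=Main_Page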
# Function to check if URL should be skipped
should_skip_url() {
    local url=$1
    case "$url" in
        *"load.php"*|*"IE9fixes.css"*|*"favicon.ico"*|*"opensearch_desc.php"*|*"api.php?action="*|*"Special:RecentChanges"*|*"Special:UserLogin"*|*"Special:RequestAccount"*|*"Dioxipedia:Privacy_policy"*|*"javascript:print();"*|*"mediawiki.org"*)
            return 0 ;; # true, should skip
        *)
            return 1 ;; # false, don't skip
    esac
}
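# Example (illustrative): a resource URL such as
#   https://wiki.example.org/load.php?modules=site.styles
# matches the *"load.php"* pattern and is skipped before any download is attempted.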
# Function to download content into a variable with timeout and error checking
download_content() {
    local url=$1
    local exclude_file=$2
    if should_skip_url "$url"; then
        echo "Skipping known irrelevant URL: $url"
        return 1
    fi
    if [ -f "$exclude_file" ] && grep -Fxq "$url" "$exclude_file"; then
        echo "Skipping excluded URL: $url"
        return 1
    fi
    echo "Downloading: $url"
    SITECACHE=$(wget -T "$DOWNLOAD_TIMEOUT" -q -O - "$url" 2>/dev/null)
    if [ $? -ne 0 ] || [ -z "$SITECACHE" ]; then
        echo -e "\033[31m[ ERROR ]:\033[0m Failed to download $url" >&2
        echo "$url" >> "$exclude_file"
        return 1
    fi
    if ! echo "$SITECACHE" | grep -q "<html"; then
        echo "Skipping: $url (not HTML)"
        echo "$url" >> "$exclude_file"
        return 1
    fi
    sleep 1
    echo "Successfully downloaded: $url"
    return 0
}
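# Note: download_content stores the fetched HTML in the global SITECACHE
# variable; extract_text, extract_title, and extract_links all read it.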
# Improved extraction function using pup and lynx
extract_text() {
    local output_file=$1
    local url=$2
    local exclude_file=$3
    echo "Extracting text from SITECACHE to $output_file"
    EXTRACTED=$(echo "$SITECACHE" | pup '#mw-content-text' 2>/dev/null)
    if [ -z "$(echo "$EXTRACTED" | sed '/^\s*$/d')" ]; then
        echo "INFO: Content empty with #mw-content-text, trying #bodyContent"
        EXTRACTED=$(echo "$SITECACHE" | pup '#bodyContent' 2>/dev/null)
    fi
    if [ -z "$(echo "$EXTRACTED" | sed '/^\s*$/d')" ]; then
        echo "WARNING: Still no content after both selectors."
        echo "$url" >> "$exclude_file"
        return 1
    fi
    echo "$EXTRACTED" | lynx -stdin -dump -nolist > "$output_file"
    if [ ! -s "$output_file" ]; then
        echo "WARNING: No text extracted from $url after lynx"
        echo "$url" >> "$exclude_file"
        rm -f "$output_file"
        return 1
    fi
    # Remove lines containing image artifacts like [something.jpg] or [something.png]
    sed -i '/\[.*\(jpg\|jpeg\|png\).*]/Id' "$output_file"
    # Delete the result if the file is smaller than 100 bytes
    if [ "$(stat -c%s "$output_file")" -lt 100 ]; then
        echo "INFO: Deleting $output_file (under 100 bytes)"
        rm -f "$output_file"
        echo "$url" >> "$exclude_file"
        return 1
    fi
    echo "Successfully extracted text to $output_file"
    return 0
}
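# Example (illustrative): after the lynx conversion, a leftover image line such as
#   [thumb_Example.JPG]
# is removed by the case-insensitive sed filter above, and a result shorter
# than 100 bytes is treated as empty and deleted.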
# Function to extract page title
extract_title() {
    echo "$SITECACHE" | grep -oP '(?<=<title>).*(?=</title>)' | head -n 1 |
        sed 's/ - dioxipedia$//' |
        sed 's/[^a-zA-Z0-9-]/_/g' |
        sed 's/__*/_/g' |
        sed 's/^_//;s/_$//'
}
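# Example (illustrative): a page titled
#   <title>Chlorine Dioxide - dioxipedia</title>
# yields the filename stem "Chlorine_Dioxide".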
# Function to extract links
extract_links() {
    local output_file=$1
    echo "$SITECACHE" | grep -oP '(?<=href=")[^"]+' | grep -v 'translate\.goog' > "$output_file"
    if [ $? -ne 0 ] || [ ! -s "$output_file" ]; then
        echo "WARNING: No links extracted"
        rm -f "$output_file"
        return 1
    fi
    echo "Successfully extracted links to $output_file"
    return 0
}
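# Note: MediaWiki pages typically emit relative hrefs such as
# /index.php?title=Some_Page; resolve_url in the main loop turns these into
# absolute URLs before downloading.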
# Main script logic
if [ $# -ne 1 ]; then
    echo "Usage: $0 <URL>" >&2
    exit 1
fi
INITIAL_URL=$1
DOMAIN=$(echo "$INITIAL_URL" | awk -F'[/:]' '{print $4}')
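# Example (illustrative): an INITIAL_URL of
#   https://wiki.example.org/index.php?title=Main_Page
# yields DOMAIN=wiki.example.org (field 4 when splitting on "/" and ":").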
BASE_URL="https://$DOMAIN"
ALL_PAGES_URL="$BASE_URL/index.php?title=Special:AllPages"
LINKSFILE="$DOMAIN/links$LINK_FILE_SUFFIX"
EXCLUDE_FILE="$DOMAIN/exclude.txt"
CONTENT_DIR="$DOMAIN/content"
mkdir -p "$DOMAIN"
mkdir -p "$CONTENT_DIR"
# Step 1: Collect links
if ! download_content "$ALL_PAGES_URL" "$EXCLUDE_FILE"; then
    echo "Failed to download $ALL_PAGES_URL"
    exit 1
fi
if ! extract_links "$LINKSFILE"; then
    echo "Failed to extract links"
    exit 1
fi
# Step 2: Process links
PROCESSED_URLS=()
while IFS= read -r link; do
    URL=$(resolve_url "$BASE_URL" "$link")
    if [[ " ${PROCESSED_URLS[*]} " =~ " $URL " ]]; then
        echo "Skipping processed URL: $URL"
        continue
    fi
    if ! download_content "$URL" "$EXCLUDE_FILE"; then
        PROCESSED_URLS+=("$URL")
        continue
    fi
    PAGENAME=$(extract_title)
    [ -z "$PAGENAME" ] && PAGENAME="page"
    TEXTFILE="$CONTENT_DIR/$PAGENAME$TEXT_FILE_SUFFIX"
    if ! extract_text "$TEXTFILE" "$URL" "$EXCLUDE_FILE"; then
        PROCESSED_URLS+=("$URL")
        continue
    fi
    PROCESSED_URLS+=("$URL")
done < "$LINKSFILE"
echo "Processing complete."
exit 0