#!/bin/bash

# Script Version: 01.8

# Description: Scrapes and extracts page text from MediaWiki pages, cleans image artifacts, and deletes empty results

# Constants
DOWNLOAD_TIMEOUT=10
TEXT_FILE_SUFFIX=".txt"
LINK_FILE_SUFFIX=".txt"

# Function to convert relative URLs to absolute URLs
resolve_url() {
    local base_url=$1
    local relative_url=$2

    if [[ "$relative_url" =~ ^https?:// ]]; then
        echo "$relative_url"
    elif [[ "$relative_url" =~ ^// ]]; then
        # Protocol-relative link: prepend the scheme (BASE_URL below is always https)
        echo "https:${relative_url}"
    elif [[ "$relative_url" =~ ^/ ]]; then
        echo "${base_url}${relative_url}"
    else
        echo "${base_url}/${relative_url}"
    fi
}
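
# Example (hypothetical wiki host, not part of the original script):
#   resolve_url "https://wiki.example.org" "/index.php?title=Main_Page"
#   -> https://wiki.example.org/index.php?title=Main_Page
#   resolve_url "https://wiki.example.org" "index.php?title=Main_Page"
#   -> https://wiki.example.org/index.php?title=Main_Page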

# Function to check if URL should be skipped
should_skip_url() {
    local url=$1
    case "$url" in
        *"load.php"*|*"IE9fixes.css"*|*"favicon.ico"*|*"opensearch_desc.php"*|*"api.php?action="*|*"Special:RecentChanges"*|*"Special:UserLogin"*|*"Special:RequestAccount"*|*"Dioxipedia:Privacy_policy"*|*"javascript:print();"*|*"mediawiki.org"*)
            return 0 ;; # true, should skip
        *)
            return 1 ;; # false, don't skip
    esac
}
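
# Example (hypothetical URL): resource and maintenance pages are filtered out.
#   should_skip_url "https://wiki.example.org/load.php?modules=site" && echo "skipped"
#   -> prints "skipped"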

# Function to download content into a variable with timeout and error checking
download_content() {
    local url=$1
    local exclude_file=$2

    if should_skip_url "$url"; then
        echo "Skipping known irrelevant URL: $url"
        return 1
    fi

    if [ -f "$exclude_file" ] && grep -Fx "$url" "$exclude_file" > /dev/null; then
        echo "Skipping excluded URL: $url"
        return 1
    fi

    echo "Downloading: $url"
    SITECACHE=$(wget -T "$DOWNLOAD_TIMEOUT" -q -O - "$url" 2>/dev/null)
    if [ $? -ne 0 ] || [ -z "$SITECACHE" ]; then
        echo -e "\033[31m[ ERROR ]:\033[0m Failed to download $url" >&2
        echo "$url" >> "$exclude_file"
        return 1
    fi

    if ! echo "$SITECACHE" | grep -q "<html"; then
        echo "Skipping: $url (not HTML)"
        echo "$url" >> "$exclude_file"
        return 1
    fi

    sleep 1
    echo "Successfully downloaded: $url"
    return 0
}
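
# Usage sketch (hypothetical URL and paths): on success the raw HTML is kept in
# the global SITECACHE variable, which the extract_* helpers below read.
#   download_content "https://wiki.example.org/index.php?title=Main_Page" "wiki.example.org/exclude.txt" \
#       && echo "cached ${#SITECACHE} characters"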

# Improved extraction function using pup and lynx
extract_text() {
    local output_file=$1
    local url=$2
    local exclude_file=$3

    echo "Extracting text from SITECACHE to $output_file"

    EXTRACTED=$(echo "$SITECACHE" | pup '#mw-content-text' 2>/dev/null)

    if [ -z "$(echo "$EXTRACTED" | sed '/^\s*$/d')" ]; then
        echo "INFO: Content empty with #mw-content-text, trying #bodyContent"
        EXTRACTED=$(echo "$SITECACHE" | pup '#bodyContent' 2>/dev/null)
    fi

    if [ -z "$(echo "$EXTRACTED" | sed '/^\s*$/d')" ]; then
        echo "WARNING: Still no content after both selectors."
        echo "$url" >> "$exclude_file"
        return 1
    fi

    echo "$EXTRACTED" | lynx -stdin -dump -nolist > "$output_file"

    if [ ! -s "$output_file" ]; then
        echo "WARNING: No text extracted from $url after lynx"
        echo "$url" >> "$exclude_file"
        rm -f "$output_file"
        return 1
    fi

    # Remove lines containing image artifacts like [something.jpg] or [something.png]
    sed -i '/\[.*\(jpg\|jpeg\|png\).*]/Id' "$output_file"

    # Delete if file is smaller than 100 bytes
    if [ "$(stat -c%s "$output_file")" -lt 100 ]; then
        echo "INFO: Deleted $output_file (under 100 bytes)"
        rm -f "$output_file"
        echo "$url" >> "$exclude_file"
        return 1
    fi

    echo "Successfully extracted text to $output_file"
    return 0
}
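
# Usage sketch (hypothetical paths): expects SITECACHE to have been filled by
# download_content for the same URL; writes plain text, or cleans up and records
# the URL in the exclude file on failure.
#   extract_text "wiki.example.org/content/Main_Page.txt" \
#       "https://wiki.example.org/index.php?title=Main_Page" "wiki.example.org/exclude.txt"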

# Function to extract page title
extract_title() {
    echo "$SITECACHE" |
        grep -oP '(?<=<title>).*(?=</title>)' |
        head -n 1 |
        sed 's/ - dioxipedia$//' |
        sed 's/[^a-zA-Z0-9-]/_/g' |
        sed 's/__*/_/g' |
        sed 's/^_//;s/_$//'
}
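
# Example: a <title> of "Main Page - dioxipedia" becomes "Main_Page": the wiki
# name suffix is stripped and the remaining characters are sanitized for use as
# a filename.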

# Function to extract links
extract_links() {
    local output_file=$1

    echo "$SITECACHE" | grep -oP '(?<=href=")[^"]+' | grep -v 'translate\.goog' > "$output_file"
    if [ $? -ne 0 ] || [ ! -s "$output_file" ]; then
        echo "WARNING: No links extracted"
        rm -f "$output_file"
        return 1
    fi

    echo "Successfully extracted links to $output_file"
    return 0
}
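
# Usage sketch (hypothetical path): writes one href value per line, e.g.
#   extract_links "wiki.example.org/links.txt"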

# Main script logic
if [ $# -ne 1 ]; then
    echo "Usage: $0 <URL>" >&2
    exit 1
fi
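
# Example invocation (hypothetical wiki and script name):
#   ./scrape_wiki.sh https://wiki.example.org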

INITIAL_URL=$1
DOMAIN=$(echo "$INITIAL_URL" | awk -F'[/:]' '{print $4}')
BASE_URL="https://$DOMAIN"
ALL_PAGES_URL="$BASE_URL/index.php?title=Special:AllPages"
LINKSFILE="$DOMAIN/links$LINK_FILE_SUFFIX"
EXCLUDE_FILE="$DOMAIN/exclude.txt"
CONTENT_DIR="$DOMAIN/content"

mkdir -p "$DOMAIN"
mkdir -p "$CONTENT_DIR"
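
# Resulting layout (hypothetical domain wiki.example.org):
#   wiki.example.org/links.txt     - every href found on Special:AllPages
#   wiki.example.org/exclude.txt   - URLs that failed or produced no usable text
#   wiki.example.org/content/*.txt - extracted page text, one file per page title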

# Step 1: Collect links
if ! download_content "$ALL_PAGES_URL" "$EXCLUDE_FILE"; then
    echo "Failed to download $ALL_PAGES_URL"
    exit 1
fi

if ! extract_links "$LINKSFILE"; then
    echo "Failed to extract links"
    exit 1
fi

# Step 2: Process links
PROCESSED_URLS=()
while IFS= read -r link; do
    URL=$(resolve_url "$BASE_URL" "$link")

    if [[ " ${PROCESSED_URLS[*]} " =~ " $URL " ]]; then
        echo "Skipping processed URL: $URL"
        continue
    fi

    if ! download_content "$URL" "$EXCLUDE_FILE"; then
        PROCESSED_URLS+=("$URL")
        continue
    fi

    PAGENAME=$(extract_title)
    [ -z "$PAGENAME" ] && PAGENAME="page"

    TEXTFILE="$CONTENT_DIR/$PAGENAME$TEXT_FILE_SUFFIX"

    if ! extract_text "$TEXTFILE" "$URL" "$EXCLUDE_FILE"; then
        PROCESSED_URLS+=("$URL")
        continue
    fi

    PROCESSED_URLS+=("$URL")
done < "$LINKSFILE"

echo "Processing complete."
exit 0