LondonScrapers/SCRAPE_MPaS.SH
2026-06-19 23:30:51 -04:00

48 lines
2.2 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# SCRAPE_MPaS.SH: archive the PDFs linked from the City of London
# "Master Plans and Strategies" listing page.
#
# Flow:
#   1. Download the listing page to ./tmp/index_mpas.html.
#   2. Split the HTML so that every tag starts on its own line.
#   3. Remember the most recent heading: the text of a <summary> element, or
#      of the <span> on the line right after a <summary> tag.
#   4. For every href ending in .pdf, call _utils_download_helper to store the
#      file under ./LondonArchive/Master Plans and Strategies/<heading>/.
#
# Must be run from the directory that contains ./functions/.
# Requires: wget, GNU sed (uses \n in the replacement text), grep.
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

# Shared helpers; _utils_download_helper is presumably defined in this file.
source ./functions/.functions
if ! command -v _utils_download_helper >/dev/null 2>&1; then
  # Without the helper every download below would fail with "command not found".
  echo "SCRAPE_MPaS.SH: _utils_download_helper is not available (is ./functions/.functions present?)" >&2
  exit 1
fi

# London seems to have recently blocked unusual user agents, so wget's default
# one is refused (and ping does not work either). Thankfully we can simply
# pretend to be a regular desktop browser.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_mpas.html"
SEARCH_URL="https://london.ca/government/council-civic-administration/master-plans-strategies/plans-strategies"
ARCHIVE_DIR="./LondonArchive/Master Plans and Strategies"

# -p: the temp directory normally already exists after the first run.
mkdir -p "$TEMP_DIR"
rm -f "$SEARCH_PAGE"

# Parser state. The loop below runs in a pipeline subshell, which inherits
# these initial values.
CURRENT=""
LAST_LINE_SUMMARY=0

# Only parse when the fetch actually succeeded; any non-zero wget status
# (network, TLS, I/O, HTTP error) means the page on disk is not usable.
# --timestamping is intentionally not passed: it has no effect with -O.
if wget --user-agent="$WGET_UA" "$SEARCH_URL" -O "$SEARCH_PAGE" -q; then #--show-progress
  # Put each tag on its own line so the line-based matching below works.
  sed 's/></>\n</g' "$SEARCH_PAGE" |
    while IFS= read -r LINE || [[ -n "$LINE" ]]; do
      # A <span> on the line right after a <summary> tag carries the heading.
      if (( LAST_LINE_SUMMARY )) && [[ "$LINE" == "<span>"* ]]; then
        CURRENT=$(sed 's/.*<span>\([^<]*\)<\/span>.*/\1/' <<<"$LINE")
        echo "$CURRENT"
      fi

      LAST_LINE_SUMMARY=0
      if [[ "$LINE" == *"<summary>"* ]]; then
        LAST_LINE_SUMMARY=1
        # Matches only when the heading text sits directly inside <summary>;
        # otherwise the line is kept as-is until the following <span> line
        # replaces it.
        CURRENT=$(sed 's/.*<summary>\([^<]*\)<\/summary>.*/\1/' <<<"$LINE")
        echo "$CURRENT"
      fi

      if [[ "$LINE" == *".pdf"* ]]; then
        # Pull out every PDF href on the line, drop the site prefix, decode
        # %20 to a space and remove %27.
        # NOTE(review): the decoded path is also used to build the download
        # URL below -- confirm _utils_download_helper copes with spaces and
        # that no linked file needs its %27 to resolve.
        # File descriptor 3 is used so the helper keeps the same stdin it had
        # before.
        while IFS= read -r -u 3 FILE; do
          # A line can mention ".pdf" without containing a PDF href.
          [[ -n "$FILE" ]] || continue
          echo "$FILE"
          mkdir -p "$ARCHIVE_DIR/$CURRENT/"
          _utils_download_helper "https://london.ca$FILE" "$ARCHIVE_DIR/$CURRENT/${FILE##*/}"
        done 3< <(
          grep -o 'href="[^"]*\.pdf"' <<<"$LINE" |
            sed 's/^href="//; s/"$//; s#^https://london\.ca##; s/%20/ /g; s/%27//g'
        )
      fi
    done
else
  echo "SCRAPE_MPaS.SH: failed to download $SEARCH_URL" >&2
  exit 1
fi