#!/usr/bin/env bash
#
# SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies.
#
# Fetches the City of London "Master Plans and Strategies" index page, walks
# it tag by tag, and hands every linked PDF to _utils_download_helper so it is
# stored under
#   ./LondonArchive/Master Plans and Strategies/<most recent heading>/
#
# Requires: wget, grep, GNU sed (for "\n" in the replacement), and
# ./functions/.functions, which is expected to define _utils_download_helper
# (called below as: _utils_download_helper <url> <destination file>).
#
# NOTE(review): the copy of this script that was reviewed had been collapsed
# onto one line and had its HTML tag literals stripped. The tag-splitting sed
# expression and the <span>/<summary> heading patterns below are
# reconstructions. In particular the original <span ...> test may have matched
# one specific class rather than any <span>; confirm against the live page
# markup before relying on the folder names this produces.

printf '%s\n' \
  "" \
  "-========================================================================-" \
  "-=- -=-" \
  "-=- SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies -=-" \
  "-=- -=-" \
  "-=- Lillian Skinner -=-" \
  "-=- -=-" \
  "-========================================================================-"

source ./functions/.functions

# London seems to have recently blocked unusual user agents, so plain wget
# (and even ping) gets refused. Thankfully we can pretend to be a real browser.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_mpas.html"
SEARCH_URL="https://london.ca/government/council-civic-administration/master-plans-strategies/plans-strategies"
SITE_ROOT="https://london.ca"
ARCHIVE_DIR="./LondonArchive/Master Plans and Strategies"

#######################################
# Print the text of the first <tag ...>text</tag> element found in a line.
# Arguments: $1 - tag name (e.g. "summary"), $2 - the line to inspect
# Outputs:   the element text on stdout
# Returns:   0 if a non-empty element text was found, 1 otherwise
#######################################
_mpas_heading_text() {
  local tag=$1 line=$2
  local pattern="<${tag}[^>]*>([^<]+)</${tag}>"
  [[ "$line" =~ $pattern ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

# -p so a tmp directory left over from a previous run is not an error.
mkdir -p "$TEMP_DIR"
rm -f "$SEARCH_PAGE"

# Check for real success instead of "anything but exit code 8": network, DNS
# and TLS failures must not be treated as a usable download.
# --timestamping is omitted because wget disables it whenever -O is given.
if wget --user-agent="$WGET_UA" "$SEARCH_URL" -O "$SEARCH_PAGE" -q && [[ -s "$SEARCH_PAGE" ]]; then #--show-progress
  CURRENT=""

  # Put every tag on its own line so headings and links can be matched with
  # simple per-line tests.
  sed 's/></>\n</g' "$SEARCH_PAGE" | while read -r LINE; do
    # The most recent <span>/<summary> text names the folder for the PDFs
    # that follow it. CURRENT only changes when text was actually extracted,
    # so bare or empty tags never end up as directory names.
    if HEADING=$(_mpas_heading_text span "$LINE"); then
      CURRENT=$HEADING
      printf '%s\n' "$CURRENT"
    fi
    if HEADING=$(_mpas_heading_text summary "$LINE"); then
      CURRENT=$HEADING
      printf '%s\n' "$CURRENT"
    fi

    if [[ "$LINE" == *".pdf"* ]]; then
      # One line can hold several links (text between tags is not split), so
      # handle each href separately. fd 3 keeps this inner read away from the
      # page stream the outer loop is consuming on stdin.
      while IFS= read -r -u 3 HREF; do
        URL_PATH=${HREF#href=\"}
        URL_PATH=${URL_PATH%\"}
        URL_PATH=${URL_PATH#"$SITE_ROOT"}
        [[ -n "$URL_PATH" ]] || continue

        # Readable local name: decode %20 to a space and drop %27
        # (apostrophes). The request itself keeps the href exactly as
        # published, otherwise files with an apostrophe would be requested
        # under the wrong path.
        FILE=$(printf '%s\n' "$URL_PATH" | sed 's/%20/ /g; s/%27//g')
        printf '%s\n' "$FILE"

        case "$URL_PATH" in
          http://* | https://*) PDF_URL=$URL_PATH ;; # absolute link to another host
          *) PDF_URL="${SITE_ROOT}${URL_PATH}" ;;
        esac

        mkdir -p "$ARCHIVE_DIR/$CURRENT/"
        _utils_download_helper "$PDF_URL" "$ARCHIVE_DIR/$CURRENT/$(basename "$FILE")"
      done 3< <(printf '%s\n' "$LINE" | grep -o 'href="[^"]*\.pdf"')
    fi
  done
else
  printf 'SCRAPE_MPaS: could not download %s; nothing scraped.\n' "$SEARCH_URL" >&2
fi