#!/usr/bin/env bash
#
# SCRAPE_MPaS.SH: scrape London Master Plans and Strategies from the
# City of London "Get Involved" site.
#
# This preamble prints the banner, loads the shared helper functions,
# defines the working-file paths, and downloads the project search page.
# The code that follows inspects the exit status of that download, so the
# wget call must stay the last command of this section.

printf '%s\n' \
  "" \
  "-========================================================================-" \
  "-=- -=-" \
  "-=- SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies -=-" \
  "-=- -=-" \
  "-=- Lillian Skinner -=-" \
  "-=- -=-" \
  "-========================================================================-"

source ./functions/.functions

# Todo:
# - Save updates (see bradley-ave)
# - Order, title, and collapse each scraped modal

# London seems to have recently blocked unusual user agents: plain wget (and
# even ping) no longer gets through, so we pretend to be a real browser.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# Working directory and the scratch files kept inside it.
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_ginv.html"
PROJECT_PAGE="./tmp/project_ginv.html"
WORK_HTML="./tmp/tmp.html"
CUSTOM_HTML="./tmp/custom_ginv.html"
CUSTOM_HTML_LINKS="./tmp/custom_link_ginv.html"
CUSTOM_HTML_PHOTOS="./tmp/custom_photo_ginv.html"
CUSTOM_HTML_FAQ="./tmp/custom_faq_ginv.html"
CUSTOM_HTML_PROFILE="./tmp/custom_profile_ginv.html"
CUSTOM_HTML_TIMELINE="./tmp/custom_timeline_ginv.html"
CUSTOM_HTML_KEYDATES="./tmp/custom_keydates_ginv.html"
CUSTOM_HTML_SLIDER="./tmp/custom_slider_ginv.html"
FULLDUMP="./tmp/.fulldump.txt"

SEARCH_URL="https://getinvolved.london.ca/projects"

# Take the date once so year, month, and day always describe the same instant
# (three separate `date` calls could straddle midnight).
read -r current_year current_month current_day < <(date '+%Y %m %d')

# Start from a clean search page; -p keeps reruns quiet when ./tmp/ exists.
rm -f -- "$SEARCH_PAGE"
mkdir -p -- "$TEMP_DIR"

#######################################
# Extract project metadata from one line of HTML that carries
# data-project-name / data-project-category / data-project-location
# attributes.
# Globals:
#   tmp              (written) entity-decoded copy of the input; left global
#                    in case code outside this section reads it
#   PROJECT_NAME     (written) name with quotes, colons, and brackets removed,
#                    passed through _utils_fix_dashes
#   PROJECT_CATS     (written) value of data-project-category
#   PROJECT_LOCATION (written) value of data-project-location
# Arguments:
#   $1 - the HTML line to parse (assumed to be a single line)
# Notes:
#   When an attribute is absent the sed substitution does not match, so the
#   corresponding variable receives the whole cleaned line.
#   _utils_fix_dashes is not defined here; presumably it comes from
#   ./functions/.functions.
#######################################
set_metadata() {
  # Decode the HTML entities that would otherwise confuse the attribute
  # regexes, and drop square brackets.
  # NOTE(review): the entity spellings (&amp; &quot; &#39; &apos;) are
  # reconstructed from the intent of the substitutions; confirm against the
  # site's markup.
  tmp=$(printf '%s\n' "$1" \
    | sed -e 's/&amp;/\&/g' \
          -e 's/&quot;//g' \
          -e "s/&#39;/'/g" \
          -e "s/&apos;/'/g" \
          -e 's/\[//g' \
          -e 's/\]//g')

  # Collapse runs of blanks explicitly instead of relying on an unquoted
  # expansion, which would also glob-expand '*' or '?' found in a name.
  PROJECT_NAME=$(_utils_fix_dashes "$(printf '%s\n' "$tmp" \
    | sed -e 's/[[:blank:]]\{1,\}/ /g' \
          -e 's/^ //' \
          -e 's/ $//' \
          -e 's/.*data-project-name="\([^"]*\)".*/\1/' \
          -e 's/‘//g' \
          -e 's/’//g' \
          -e "s/'//g" \
          -e 's/://g')")

  PROJECT_CATS=$(printf '%s\n' "$tmp" \
    | sed 's/.*data-project-category="\([^"]*\)".*/\1/')
  PROJECT_LOCATION=$(printf '%s\n' "$tmp" \
    | sed 's/.*data-project-location="\([^"]*\)".*/\1/')
}

# Fetch the project search page. The statement that follows tests this
# command's exit status, so nothing may be inserted after it.
# NOTE(review): the wget manual says -N/--timestamping is not supported
# together with -O; the flag is kept only to leave the invocation unchanged.
wget --user-agent="$WGET_UA" "$SEARCH_URL" -O "$SEARCH_PAGE" --timestamping -q #--show-progress
if [ $? -ne 8 ]; then while IFS= read -r LINE; do if (( FOUND_DATE )) && [[ "$LAST_LINE" == "" ]] && (( 10#$ITEM_YEAR >= 10#$current_year )) && (( 10#$ITEM_MONTH >= $((10#$current_month - 1)) )); then FOUND_DATE=0 echo $PROJECT_URL echo $PROJECT_NAME wget --user-agent="$WGET_UA" "$PROJECT_URL" -O $PROJECT_PAGE --timestamping -q #--show-progress # Now we can work on the actual project page. rm -f $CUSTOM_HTML_LINKS rm -f $CUSTOM_HTML_PHOTOS rm -f $CUSTOM_HTML_FAQ rm -f $CUSTOM_HTML_PROFILE rm -f $CUSTOM_HTML_TIMELINE rm -f $CUSTOM_HTML_KEYDATES rm -f $CUSTOM_HTML_SLIDER rm -f $FULLDUMP cat ./template/default_getinvolved.html > $CUSTOM_HTML echo "