LondonScrapers/SCRAPE_GINV.SH
2026-06-19 23:30:51 -04:00

330 lines
17 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# SCRAPE_GINV.SH: scrape the City of London "Get Involved" project pages.
# All paths used by this script are relative (./functions, ./template, ./tmp,
# ./LondonArchive), so run it from the directory that contains them.

#######################################
# Print the start-up banner: a boxed, centred title and author line.
# The box lines are padded from the width of the rule, so changing the
# title text cannot knock the right-hand edge out of alignment.
# Arguments: none
# Outputs:   a blank line followed by the banner, on stdout
#######################################
_ginv_print_banner() {
  local -r rule='-========================================================================-'
  # Space available between the "-=-" edge markers on each box line.
  local -r inner_width=$(( ${#rule} - 6 ))
  local text left right

  printf '\n%s\n' "$rule"
  for text in \
    '' \
    'SCRAPE_GINV.SH: Scrape London Get Involved Projects' \
    '' \
    'Lillian Skinner' \
    ''; do
    left=$(( (inner_width - ${#text}) / 2 ))
    right=$(( inner_width - ${#text} - left ))
    printf -- '-=-%*s%s%*s-=-\n' "$left" '' "$text" "$right" ''
  done
  printf '%s\n' "$rule"
}

_ginv_print_banner

# Shared helpers; the rest of the script calls _utils_fix_dashes,
# _utils_download_helper and _time_parse_monddyyyy, none of which are defined
# in this file.
source ./functions/.functions
# TODO:
#  - Save updates (see bradley-ave)
#  - Order, title, and collapse each scraped modal

# London seems to have recently started blocking unusual user agents, so plain
# wget (and even ping) no longer gets through. Thankfully we can pretend to be
# a real person by sending a desktop-browser user-agent string.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# Scratch files. All of them live under TEMP_DIR.
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_ginv.html"      # downloaded project index
PROJECT_PAGE="./tmp/project_ginv.html"   # downloaded page of the current project
WORK_HTML="./tmp/tmp.html"               # HTML assembled for the gallery PDF
CUSTOM_HTML="./tmp/custom_ginv.html"     # HTML assembled for the main PDF
CUSTOM_HTML_LINKS="./tmp/custom_link_ginv.html"
CUSTOM_HTML_PHOTOS="./tmp/custom_photo_ginv.html"
CUSTOM_HTML_FAQ="./tmp/custom_faq_ginv.html"
CUSTOM_HTML_PROFILE="./tmp/custom_profile_ginv.html"
CUSTOM_HTML_TIMELINE="./tmp/custom_timeline_ginv.html"
CUSTOM_HTML_KEYDATES="./tmp/custom_keydates_ginv.html"
CUSTOM_HTML_SLIDER="./tmp/custom_slider_ginv.html"
FULLDUMP="./tmp/.fulldump.txt"           # raw dump of everything kept from a project

# Today's date, zero-padded, for the "recently updated" filter.
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)

# Start from a fresh index. -p keeps mkdir quiet when the scratch directory is
# already there; nothing in this script removes it, so without -p every run
# after the first would print a "File exists" error.
rm -f -- "$SEARCH_PAGE"
mkdir -p -- "$TEMP_DIR"

SEARCH_URL="https://getinvolved.london.ca/projects"
#######################################
# Pull the project name, categories and location out of the opening tag of
# a project card and store them in the PROJECT_* globals.
# Globals:
#   PROJECT_NAME      (written) data-project-name with apostrophes and colons
#                     removed and whitespace runs squeezed, then passed
#                     through _utils_fix_dashes (defined outside this file)
#   PROJECT_CATS      (written) value of data-project-category
#   PROJECT_LOCATION  (written) value of data-project-location
#   tmp               (written) entity-decoded copy of $1; left global, as the
#                     function has always assigned it without `local`
# Arguments:
#   $1 - HTML text of the card's opening tag, on a single line
# Outputs:
#   nothing; an attribute that is missing from $1 yields an empty value
#######################################
set_metadata() {
  local name

  # Decode the HTML entities used inside attribute values and drop square
  # brackets. Only the *escaped* quote (&quot;) is removed: the literal double
  # quotes around each attribute must survive, because the extraction
  # patterns below anchor on them. &amp; is decoded first so a doubly escaped
  # entity (e.g. &amp;#039;) is also resolved.
  tmp=$(printf '%s\n' "$1" \
    | sed -e 's/&amp;/\&/g' \
          -e 's/&quot;//g' \
          -e "s/&#0*39;/'/g" \
          -e 's/[][]//g')

  # -n ... p prints only on a match, so a missing attribute gives an empty
  # string instead of echoing the entire tag back.
  # Apostrophes and colons are stripped from the name, and whitespace runs are
  # squeezed to one space (the unquoted echo used here before did that as a
  # side effect of word splitting, but also exposed the text to globbing).
  # NOTE(review): this pipeline used to contain two further substitutions
  # whose search pattern was empty (s///g). sed rejects an empty pattern when
  # there is no earlier regex, which left the name empty, so they are omitted.
  # If they were meant to strip particular characters, re-add them with the
  # characters spelled out explicitly.
  name=$(printf '%s\n' "$tmp" \
    | sed -n 's/.*data-project-name="\([^"]*\)".*/\1/p' \
    | sed -e "s/'//g" -e 's/://g' -e 's/[[:space:]]\{1,\}/ /g')

  PROJECT_NAME=$(_utils_fix_dashes "$name")
  PROJECT_CATS=$(printf '%s\n' "$tmp" \
    | sed -n 's/.*data-project-category="\([^"]*\)".*/\1/p')
  PROJECT_LOCATION=$(printf '%s\n' "$tmp" \
    | sed -n 's/.*data-project-location="\([^"]*\)".*/\1/p')
}
# Main driver: download the project index and walk it line by line.
# The tail of the outer loop is a small state machine that collects the name,
# URL and "updated" date of each project card. Once a date has been parsed,
# the head of the following iteration decides whether the project is recent
# enough and, if so, scrapes its page into a PDF plus attachments under
# ./LondonArchive/GetInvolved/<project name>/.
# None of the state flags (FOUND_DATE, NEXT_LINE_URL, IS_*_BLOCK, ...) are
# initialised anywhere in this file; inside (( )) an unset variable counts as 0.
wget --user-agent="$WGET_UA" "$SEARCH_URL" -O $SEARCH_PAGE --timestamping -q #--show-progress
# wget exits with status 8 when the server answers with an error response.
# Every other status, including other kinds of failure, falls through to the loop.
if [ $? -ne 8 ]; then
while IFS= read -r LINE; do
# Scrape the pending project when: a date line was seen, no split card line is
# waiting to be completed, and the date passes the recency window.
# The 10# prefix forces base 10 so zero-padded values such as 08 or 09 are not
# rejected as invalid octal numbers.
# ITEM_YEAR / ITEM_MONTH are not assigned in this file; presumably
# _time_parse_monddyyyy sets them - TODO confirm.
# NOTE(review): the year must be >= the current year AND the month >= last
# month. In January "last month" evaluates to 0, yet the year test still
# excludes December of the previous year - confirm that is intended.
if (( FOUND_DATE )) && [[ "$LAST_LINE" == "" ]] && (( 10#$ITEM_YEAR >= 10#$current_year )) && (( 10#$ITEM_MONTH >= $((10#$current_month - 1)) )); then
FOUND_DATE=0
echo $PROJECT_URL
echo $PROJECT_NAME
wget --user-agent="$WGET_UA" "$PROJECT_URL" -O $PROJECT_PAGE --timestamping -q #--show-progress
# Now we can work on the actual project page.
# Clear what the previous project left behind.
rm -f $CUSTOM_HTML_LINKS
rm -f $CUSTOM_HTML_PHOTOS
rm -f $CUSTOM_HTML_FAQ
rm -f $CUSTOM_HTML_PROFILE
rm -f $CUSTOM_HTML_TIMELINE
rm -f $CUSTOM_HTML_KEYDATES
rm -f $CUSTOM_HTML_SLIDER
rm -f $FULLDUMP
# The main document starts as a copy of the template followed by the title.
cat ./template/default_getinvolved.html > $CUSTOM_HTML
echo "<h1>$PROJECT_NAME</h1>" >> $CUSTOM_HTML
# Per-line pass over the project page. Each flag below means "an earlier line
# opened this kind of block". A line is offered to every block that is
# currently open, and only afterwards (end of the loop body) is it tested as
# the opener of a new block, so an opener line is never part of its own block.
while IFS= read -r LINE_PROJ; do
# --- Content block: copied into the main HTML and the dump ---
if (( NEXT_LINE_CONTENT )); then
# Next hive-block marks end of current item
if [[ "$LINE_PROJ" == *"hive-block"* ]] || [[ "$LINE_PROJ" == "" ]]; then
NEXT_LINE_CONTENT=0
echo "End of current content."
else
# Ignore boring notices
if [[ "$LINE_PROJ" != *"</h1>"* ]] &&
[[ "$LINE_PROJ" != *"City of London Land Acknowledgement"* ]] &&
[[ "$LINE_PROJ" != *"Ongoing Site Specific Planning Applications"* ]] &&
[[ "$LINE_PROJ" != *"This site is owned and operated by the City of London using software licensed from Social Pinpoint"* ]] &&
[[ "$LINE_PROJ" != *"Social Pinpoint has been commissioned by City of London (Canada) to collect and display user content on their behalf"* ]] &&
[[ "$LINE_PROJ" != *"Notice of Collection of Personal Information"* ]] &&
[[ "$LINE_PROJ" != *'href="/register"'* ]] &&
[[ "$LINE_PROJ" != *'href="/login"'* ]] &&
[[ "$LINE_PROJ" != *"Users have the right to access, correct, or delete their personal information"* ]] &&
[[ "$LINE_PROJ" != *"This privacy policy may change from time to time"* ]] &&
#[[ "$LINE_PROJ" != *"Share your feedback"* ]] &&
[[ "$LINE_PROJ" != *"Notice of Collection"* ]] &&
#[[ "$LINE_PROJ" != *"Subscribe for project updates"* ]] &&
[[ "$LINE_PROJ" != *"Ready to have your say?"* ]]; then
# seds to replace youtube iframe with a normal <a href=""> link. wkhtmltopdf obviously can't embed youtube videos.
# The section marker goes into the dump once, just before the first kept line,
# so a content block with nothing worth keeping leaves no marker behind.
if (( FIRST_CONTENT )); then
echo "<!-- LondonArchive_GINV_Body -->" >> $FULLDUMP
FIRST_CONTENT=0
fi
# The outer echo receives an unquoted command substitution: runs of whitespace
# collapse to single spaces and glob characters in the line can be expanded.
echo $(echo " $LINE_PROJ" | sed 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' | sed 's/<iframe/<a/' | sed 's/<\/iframe>/YouTube Link<\/a><\/br>/') >> $CUSTOM_HTML
echo $(echo " $LINE_PROJ" | sed 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' | sed 's/<iframe/<a/' | sed 's/<\/iframe>/YouTube Link<\/a><\/br>/') >> $FULLDUMP
fi
fi
fi
# --- Document library: lines are collected for the download pass below ---
if (( IS_DOC_BLOCK )); then
if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
IS_DOC_BLOCK=0
# Discard the collected lines if none of them is a download link.
# If no line was collected the file does not exist and cat reports an error
# on stderr; FOUND_LINK is then empty and the rm -f is a no-op.
FOUND_LINK=$(cat "$CUSTOM_HTML_LINKS" | grep "/download_file/")
if [[ "$FOUND_LINK" == "" ]]; then
rm -f $CUSTOM_HTML_LINKS
fi
echo "End of current documents."
else
if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
echo $LINE_PROJ >> $CUSTOM_HTML_LINKS
echo $LINE_PROJ >> $FULLDUMP
fi
fi
fi
# --- Media block: only the href of lines carrying aria-label=" is kept ---
if (( IS_PHOTO_BLOCK )); then
if [[ "$LINE_PROJ" == *"<!-- end foreach -->"* ]]; then
IS_PHOTO_BLOCK=0
# Discard the collected URLs unless at least one mentions "amazonaws".
FOUND_LINK=$(cat "$CUSTOM_HTML_PHOTOS" | grep "amazonaws")
if [[ "$FOUND_LINK" == "" ]]; then
rm -f $CUSTOM_HTML_PHOTOS
fi
echo "End of current photos."
else
if [[ "$LINE_PROJ" == *'aria-label="'* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
echo $(echo $LINE_PROJ | sed 's/.*href="\([^"]*\)".*/\1/') >> $CUSTOM_HTML_PHOTOS
echo $(echo $LINE_PROJ | sed 's/.*href="\([^"]*\)".*/\1/') >> $FULLDUMP
fi
fi
fi
# --- FAQ modal: appended straight to the main HTML ---
# CUSTOM_HTML_FAQ is removed per project but never written in this file.
if (( IS_FAQ_BLOCK )); then
if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
IS_FAQ_BLOCK=0
echo "End of current FAQ."
else
if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
# I don't care that this is invalid HTML. All you'll see in the end is a nicely formatted PDF.
# Question links become <h3> headings. Note that these rewritten lines go to
# the main HTML only; they are not copied to the dump.
if [[ "$LINE_PROJ" == *"hive-block-faq mod-reverse"* ]]; then
echo $(echo $LINE_PROJ | sed 's/<a role/<h3 role/g' | sed 's/<\/a>/<\/h3>/g') >> $CUSTOM_HTML
elif [[ "$LINE_PROJ" != *"</h1>"* ]]; then
echo $LINE_PROJ >> $CUSTOM_HTML
echo $LINE_PROJ >> $FULLDUMP
fi
fi
fi
fi
# --- Bio/profile block: runs until the next line containing <script> ---
if (( IS_PROFILE_BLOCK )); then
if [[ "$LINE_PROJ" == *"<script>"* ]]; then
IS_PROFILE_BLOCK=0
echo "End of current profile."
else
if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
echo $LINE_PROJ >> $CUSTOM_HTML
echo $LINE_PROJ >> $FULLDUMP
fi
fi
fi
# --- Timeline block: runs until the "btn-unfill btn-primary" button ---
if (( IS_TIMELINE_BLOCK )); then
if [[ "$LINE_PROJ" == *"btn-unfill btn-primary"* ]]; then
IS_TIMELINE_BLOCK=0
echo "End of current timeline."
else
# The first test is always true here (the branch above already caught that
# string); only the sr-only filter has an effect.
if [[ "$LINE_PROJ" != *"btn-unfill btn-primary"* ]] && [[ "$LINE_PROJ" != *'class="sr-only"'* ]]; then
echo $LINE_PROJ >> $CUSTOM_HTML
echo $LINE_PROJ >> $FULLDUMP
fi
fi
fi
# --- Key dates modal: runs until modal-footer ---
if (( IS_KEYDATES_BLOCK )); then
if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
IS_KEYDATES_BLOCK=0
echo "End of current key dates."
else
if [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"btn-close btn-inverse close"* ]]; then
echo $LINE_PROJ >> $CUSTOM_HTML
echo $LINE_PROJ >> $FULLDUMP
fi
fi
fi
# --- Image slider: collected separately and later rendered as a gallery PDF ---
if (( IS_SLIDER_BLOCK )); then
if [[ "$LINE_PROJ" == *"<!-- Controls -->"* ]]; then
IS_SLIDER_BLOCK=0
# NOTE(review): this message says "key dates" although it marks the end of the
# slider block - looks copied from the block above.
echo "End of current key dates."
else
if [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"</h3"* ]]; then
echo $LINE_PROJ >> $CUSTOM_HTML_SLIDER
echo $LINE_PROJ >> $FULLDUMP
fi
fi
fi
# --- Single image block: runs until the next empty line ---
if (( IS_SINGLE_IMAGE_BLOCK )); then
if [[ "$LINE_PROJ" == "" ]]; then
IS_SINGLE_IMAGE_BLOCK=0
# NOTE(review): this block writes its lines to CUSTOM_HTML, yet the check
# below inspects (and may delete) CUSTOM_HTML_PHOTOS, the media block's file -
# confirm this is intended and not a copy of the media-block ending.
FOUND_LINK=$(cat "$CUSTOM_HTML_PHOTOS" | grep "amazonaws")
if [[ "$FOUND_LINK" == "" ]]; then
rm -f $CUSTOM_HTML_PHOTOS
else
cat "$CUSTOM_HTML_PHOTOS"
fi
echo "End of current single image."
else
if [[ "$LINE_PROJ" == *'class="hive-image"'* ]]; then
echo $LINE_PROJ >> $CUSTOM_HTML
echo $LINE_PROJ >> $FULLDUMP
fi
fi
fi
# --- Block openers: the first matching marker raises its flag and (except for
# content) writes a section marker into the dump ---
if [[ "$LINE_PROJ" == *"hive-block hive-block-content ljs"* ]]; then
NEXT_LINE_CONTENT=1
FIRST_CONTENT=1
# We'll write the LA comment inside of the content block.
# There we can ensure that the comment is only written if content does exist.
echo "Found content start."
elif [[ "$LINE_PROJ" == *"docLibModal hive-block-document-library"* ]]; then
IS_DOC_BLOCK=1
echo "<!-- LondonArchive_GINV_Documents -->" >> $FULLDUMP
echo "Found documents start."
elif [[ "$LINE_PROJ" == *"hive-block-media hive-block"* ]]; then
IS_PHOTO_BLOCK=1
echo "<!-- LondonArchive_GINV_Photos -->" >> $FULLDUMP
echo "Found photos start."
elif [[ "$LINE_PROJ" == *"hive-modal faqModal hive-block-faq"* ]]; then
IS_FAQ_BLOCK=1
echo "<!-- LondonArchive_GINV_FAQ -->" >> $FULLDUMP
echo "Found FAQ start."
elif [[ "$LINE_PROJ" == *"hive-block-bio hive-block"* ]]; then
IS_PROFILE_BLOCK=1
echo "<!-- LondonArchive_GINV_Bio -->" >> $FULLDUMP
echo "Found profile start."
elif [[ "$LINE_PROJ" == *"hive-block-timeline hive-block"* ]]; then
IS_TIMELINE_BLOCK=1
echo "<!-- LondonArchive_GINV_Timeline -->" >> $FULLDUMP
echo "Found timeline start."
elif [[ "$LINE_PROJ" == *"hive-modal dateModal"* ]]; then
IS_KEYDATES_BLOCK=1
echo "<!-- LondonArchive_GINV_Date -->" >> $FULLDUMP
echo "Found key dates start."
elif [[ "$LINE_PROJ" == *"<!-- Wrapper for slider -->"* ]]; then
IS_SLIDER_BLOCK=1
echo "<!-- LondonArchive_GINV_Slider -->" >> $FULLDUMP
echo "Found slider start."
elif [[ "$LINE_PROJ" == *"hive-block hive-block-image"* ]]; then
IS_SINGLE_IMAGE_BLOCK=1
echo "<!-- LondonArchive_GINV_SingleImage -->" >> $FULLDUMP
echo "Found single image start."
fi
done < $PROJECT_PAGE
#cat "$CUSTOM_HTML_FAQ" >> "$CUSTOM_HTML"
#cat "$CUSTOM_HTML_LINKS" >> "$CUSTOM_HTML"
# Page parsed; now write the archive entry for this project.
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/"
# Documents: the file name comes from the "location:" header of a HEAD request
# (curl -I, following redirects): last path segment, minus a leading 32-hex-digit
# prefix and underscore. The final sed drops one character after ".pdf" -
# presumably the carriage return that ends an HTTP header line; TODO confirm.
# Only a lower-case "location:" header is matched.
# _utils_download_helper is defined outside this file; it is called with the
# source URL and the destination path.
if [ -e "$CUSTOM_HTML_LINKS" ] && [ -s "$CUSTOM_HTML_LINKS" ]; then
while IFS= read -r LINE_DOC; do
if [[ "$LINE_DOC" == *"download_file"* ]]; then
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
DOC_NAME=$(curl -s -L -I "$(echo $LINE_DOC | sed 's/.*href="\([^"]*\)".*/\1/')" | grep "location:" | sed 's/location: //' | sed 's/.*\///' | sed 's/^[0-9a-f]\{32\}_//' | sed 's/\.pdf./\.pdf/')
echo $DOC_NAME
_utils_download_helper "$(echo $LINE_DOC | sed 's/.*href="\([^"]*\)".*/\1/')" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
fi
done < $CUSTOM_HTML_LINKS
fi
# Photos: each collected line is already a URL; the file name is its last path
# segment with the same 32-hex-digit prefix removed.
if [ -e "$CUSTOM_HTML_PHOTOS" ] && [ -s "$CUSTOM_HTML_PHOTOS" ]; then
while IFS= read -r LINE_DOC; do
if [[ "$LINE_DOC" == *"amazonaws"* ]]; then
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
DOC_NAME=$(echo $LINE_DOC | sed 's/.*\///' | sed 's/^[0-9a-f]\{32\}_//')
echo $DOC_NAME
_utils_download_helper "$LINE_DOC" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
fi
done < $CUSTOM_HTML_PHOTOS
fi
# Slider: template + heading + collected slider markup, rendered to a gallery PDF.
if [ -e "$CUSTOM_HTML_SLIDER" ] && [ -s "$CUSTOM_HTML_SLIDER" ]; then
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
cat ./template/default_getinvolved.html > $WORK_HTML
echo "<h1>$PROJECT_NAME Photo Gallery</h1>" >> $WORK_HTML
cat "$CUSTOM_HTML_SLIDER" >> $WORK_HTML
echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> $WORK_HTML
wkhtmltopdf --image-quality 100 "$WORK_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/${PROJECT_NAME}_GALLERY.pdf"
fi
# Main document: add the footer, render it, and keep the raw dump beside it.
echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> $CUSTOM_HTML
wkhtmltopdf --image-quality 100 "$CUSTOM_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Main.pdf"
cp "$FULLDUMP" "./LondonArchive/GetInvolved/$PROJECT_NAME/.backup.txt"
fi
# --- Index state machine ---
# The line that follows a card's opening tag carries the project link.
if (( NEXT_LINE_URL )); then
NEXT_LINE_URL=0
PROJECT_URL=$(echo $LINE | sed 's/.*href="\([^"]*\)".*/\1/')
fi
# A non-empty LAST_LINE is the first half of a card tag that was split across
# two lines; join it with the current line before parsing.
if [[ "$LAST_LINE" != "" ]]; then
set_metadata "$LAST_LINE$LINE"
LAST_LINE=""
NEXT_LINE_URL=1
# Opening tag of a project card; lines containing "<%-" are skipped.
elif [[ "$LINE" == *"h-entry project card"* ]] && [[ "$LINE" == *"data-project-name"* ]] && [[ "$LINE" != *"<%-"* ]]; then
#echo $LINE
if [[ "$LINE" != *"data-project-category"* ]]; then
# Sometimes lines are split, so we'll combine the pieces over time.
LAST_LINE=$LINE
echo "Line is split!"
else
LAST_LINE=""
set_metadata "$LINE"
NEXT_LINE_URL=1
fi
# "Updated" date of the card: take the text inside <time ...>...</time> and
# hand it to _time_parse_monddyyyy (defined outside this file). Setting
# FOUND_DATE makes the next iteration evaluate the recency test at the top.
elif [[ "$LINE" == *'time class="dt-updated"'* ]]; then
PROJECT_DATE=$(echo $LINE | sed 's/.*<time[^>]*>\([^<]*\)<[\/:-]time>.*/\1/g')
echo $PROJECT_DATE
_time_parse_monddyyyy "$PROJECT_DATE"
echo "$ITEM_YEAR$ITEM_MONTH$ITEM_DAY"
FOUND_DATE=1
fi
done < $SEARCH_PAGE
fi