LondonScrapers/SCRAPE_GINV_OLD.SH
2026-06-19 23:30:51 -04:00

302 lines
15 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Startup banner and helper loading for the Get Involved London scraper.
# The banner previously carried the title of a different script
# (SCRAPE_MPaS.SH / "Master Plans and Strategies"); it now names this one.
printf '%s\n' \
  "" \
  "-========================================================================-" \
  "-=- -=-" \
  "-=- SCRAPE_GINV_OLD.SH: Scrape Get Involved London Projects -=-" \
  "-=- -=-" \
  "-=- Lillian Skinner -=-" \
  "-=- -=-" \
  "-========================================================================-"

# The rest of the script calls _utils_fix_dashes and _utils_download_helper,
# which are expected to come from this file. Stop here if it cannot be loaded
# instead of failing on every project later on.
if ! source ./functions/.functions; then
  printf 'error: could not load ./functions/.functions (run from the repository root)\n' >&2
  exit 1
fi
# Todo:
# - Save updates (see bradley-ave)
# - Order, title, and collapse each scraped modal
# London seems to have recently blocked unusual user agents, so plain wget (or even ping) no longer works. Thankfully, we can pretend to be a real person!
# Browser-like user agent sent with every wget request.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# Scratch files. Everything lives under TEMP_DIR and is rebuilt per project.
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_ginv.html"      # downloaded project index
PROJECT_PAGE="./tmp/project_ginv.html"   # downloaded page of the current project
WORK_HTML="./tmp/tmp.html"               # HTML used to render the gallery PDF
CUSTOM_HTML="./tmp/custom_ginv.html"     # HTML used to render Main.pdf
CUSTOM_HTML_LINKS="./tmp/custom_link_ginv.html"
CUSTOM_HTML_PHOTOS="./tmp/custom_photo_ginv.html"
CUSTOM_HTML_FAQ="./tmp/custom_faq_ginv.html"
CUSTOM_HTML_PROFILE="./tmp/custom_profile_ginv.html"
CUSTOM_HTML_UPDATE="./tmp/custom_update_ginv.html"
CUSTOM_HTML_KEYDATES="./tmp/custom_keydates_ginv.html"
CUSTOM_HTML_SLIDER="./tmp/custom_slider_ginv.html"
FULLDUMP="./tmp/.fulldump.txt"           # raw dump copied next to each project

SEARCH_URL="https://getinvolvedlondon.ca.engagementhq.com"

# -p: the directory normally survives from the previous run, and a plain
# mkdir would print an error every time.
mkdir -p "$TEMP_DIR"
rm -f "$SEARCH_PAGE"

# Keep this wget as the LAST command of this section: the code that follows
# inspects its exit status through $?.
wget --user-agent="$WGET_UA" "$SEARCH_URL/projects" -O "$SEARCH_PAGE" --timestamping -q #--show-progress
# Exit status of the command that ran immediately before this section (the
# wget that fetched the project index). It must be captured before anything
# else runs, because every later command overwrites $?.
SEARCH_STATUS=$?

#######################################
# Collapse whitespace the way an unquoted `echo $var` does (trim both ends,
# squeeze inner runs to one space) but WITHOUT pathname expansion, so a `*`
# or `?` inside scraped HTML stays literal.
# Arguments: $1 - text to normalise (a single line)
# Outputs:   normalised text followed by a newline
#######################################
_ginv_squeeze() {
  local IFS=$' \t\n'
  local -a words=()
  read -r -a words <<< "$1"
  printf '%s\n' "${words[*]}"
}

#######################################
# Append one whitespace-normalised line to each of the given files.
# Arguments: $1 - line to write; $2.. - files to append to
#######################################
_ginv_append_line() {
  local IFS=$' \t\n'
  local -a words=()
  local target
  read -r -a words <<< "$1"
  shift
  for target in "$@"; do
    printf '%s\n' "${words[*]}" >> "$target"
  done
}

#######################################
# Print the value of the last href="..." attribute on a line. If the line has
# no href, the (normalised) line is printed unchanged.
# Arguments: $1 - HTML line
#######################################
_ginv_href() {
  _ginv_squeeze "$1" | sed 's/.*href="\([^"]*\)".*/\1/'
}

#######################################
# Turn the HTML of a project-tile name into text that is usable as a
# directory name: take the <span> text, replace slashes/backslashes with
# " and ", drop quotes, colons, (R) signs and HTML entities, trim spaces.
# Arguments: $1 - HTML containing <span ...>name</span>
# Outputs:   cleaned name (dash normalisation is left to the caller)
#
# NOTE(review): the previous pipeline contained two `sed 's///g'` stages. As
# written they have an empty pattern, which makes sed abort ("no previous
# regular expression") and empties the whole name. The file viewer flags this
# file for look-alike Unicode characters, so those patterns presumably held
# such characters; re-add them here as explicit byte escapes once identified.
#######################################
_ginv_sanitize_name() {
  _ginv_squeeze "$1" | sed \
    -e 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/g' \
    -e 's/\// and /g' \
    -e 's/\\/ and /g' \
    -e "s/'//g" \
    -e 's/://g' \
    -e 's/®//g' \
    -e 's/&quot;//g' \
    -e 's/&amp;/and/g' \
    -e 's/&#39;//g' \
    -e 's/amp;//g' \
    -e 's/^ *//' \
    -e 's/ *$//'
  # The last two expressions replace 's/^ +| +$//g', which in sed's default
  # (basic) regex syntax treats "+" and "|" literally and never trimmed.
}

#######################################
# Print the link text of a document anchor without the " (pdf)" suffix and
# without surrounding spaces.
# Arguments: $1 - HTML line containing <a ...>title</a>
#######################################
_ginv_doc_title() {
  _ginv_squeeze "$1" | sed \
    -e 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/g' \
    -e 's/ (pdf)//' \
    -e 's/^ *//' \
    -e 's/ *$//'
}

#######################################
# Keep a scratch file only if it contains the given fixed string.
# Arguments: $1 - file, $2 - string that must occur in it
# Returns:   0 if the file was kept, 1 if it was missing or removed
#######################################
_ginv_keep_if_contains() {
  if grep -qF -- "$2" "$1" 2>/dev/null; then
    return 0
  fi
  rm -f -- "$1"
  return 1
}

#######################################
# Walk $PROJECT_PAGE line by line and copy the interesting widgets into the
# scratch files. Each IS_*_BLOCK flag means "we are inside that widget"; the
# flag checks run BEFORE the start-marker checks, so a start-marker line is
# never copied as content of the block it opens.
# Globals: PROJECT_PAGE, CUSTOM_HTML*, FULLDUMP (read; files are appended to)
#######################################
_ginv_scan_project_page() {
  local LINE_PROJ href faq_heading converted
  local IS_DOC_BLOCK=0 IS_PHOTO_BLOCK=0 IS_FAQ_BLOCK=0 IS_PROFILE_BLOCK=0
  local IS_UPDATE_BLOCK=0 IS_KEYDATES_BLOCK=0 IS_SLIDER_BLOCK=0
  local IS_SINGLE_IMAGE_BLOCK=0

  while IFS= read -r LINE_PROJ; do
    # ---- Document library: collect anchors until the list closes ----------
    if (( IS_DOC_BLOCK )); then
      if [[ "$LINE_PROJ" == "</ul>" ]]; then
        IS_DOC_BLOCK=0
        _ginv_keep_if_contains "$CUSTOM_HTML_LINKS" "/documents/"
        echo "End of current documents."
      elif [[ "$LINE_PROJ" == *"a data-url"* ]]; then
        _ginv_append_line "$LINE_PROJ" "$CUSTOM_HTML_LINKS" "$FULLDUMP"
        _ginv_squeeze "$LINE_PROJ"
      fi
    fi

    # ---- Photo gallery: collect image URLs until a blank line -------------
    if (( IS_PHOTO_BLOCK )); then
      if [[ "$LINE_PROJ" == "" ]]; then
        IS_PHOTO_BLOCK=0
        _ginv_keep_if_contains "$CUSTOM_HTML_PHOTOS" "amazonaws"
        echo "End of current photos."
      elif [[ "$LINE_PROJ" == *'aria-label="'* && "$LINE_PROJ" != *"</h1>"* ]]; then
        href=$(_ginv_href "$LINE_PROJ")
        _ginv_append_line "$href" "$CUSTOM_HTML_PHOTOS" "$FULLDUMP"
      fi
    fi

    # ---- FAQ ---------------------------------------------------------------
    if (( IS_FAQ_BLOCK )); then
      if [[ "$LINE_PROJ" == *"div class='clearfix'></div"* ]]; then
        IS_FAQ_BLOCK=0
        echo "End of current FAQ."
      elif [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
        if [[ "$LINE_PROJ" == *"hive-block-faq mod-reverse"* ]]; then
          # Question links become <h3> headings. The result is not valid
          # HTML, but it only ever ends up rendered into the PDF.
          faq_heading=$(_ginv_squeeze "$LINE_PROJ" \
            | sed -e 's/<a role/<h3 role/g' -e 's/<\/a>/<\/h3>/g')
          _ginv_append_line "$faq_heading" "$CUSTOM_HTML"
        elif [[ "$LINE_PROJ" != *"</h1>"* ]]; then
          _ginv_append_line "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
        fi
      fi
    fi

    # ---- Project team profile ---------------------------------------------
    if (( IS_PROFILE_BLOCK )); then
      if [[ "$LINE_PROJ" == *"<!--[if IE]>"* ]]; then
        IS_PROFILE_BLOCK=0
        echo "End of current profile."
      elif [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
        _ginv_append_line "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
      fi
    fi

    # ---- Project update ----------------------------------------------------
    if (( IS_UPDATE_BLOCK )); then
      if [[ "$LINE_PROJ" == *"<div class='clearfix'></div>"* ]]; then
        IS_UPDATE_BLOCK=0
        echo "End of current update."
      elif [[ "$LINE_PROJ" != *"btn-unfill btn-primary"* \
           && "$LINE_PROJ" != *'class="sr-only"'* ]]; then
        _ginv_append_line "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
      fi
    fi

    # ---- Key dates ---------------------------------------------------------
    if (( IS_KEYDATES_BLOCK )); then
      if [[ "$LINE_PROJ" == "" ]]; then
        IS_KEYDATES_BLOCK=0
        echo "End of current key dates."
      elif [[ "$LINE_PROJ" != *"btn btn-default"* \
           && "$LINE_PROJ" != *"btn-close btn-inverse close"* ]]; then
        _ginv_append_line "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
      fi
    fi

    # ---- Image slider (rendered into its own gallery PDF later) -----------
    if (( IS_SLIDER_BLOCK )); then
      if [[ "$LINE_PROJ" == *"<!-- Controls -->"* ]]; then
        IS_SLIDER_BLOCK=0
        # This message used to read "key dates", copied from the block above.
        echo "End of current slider."
      elif [[ "$LINE_PROJ" != *"btn btn-default"* \
           && "$LINE_PROJ" != *"</h3"* ]]; then
        _ginv_append_line "$LINE_PROJ" "$CUSTOM_HTML_SLIDER" "$FULLDUMP"
      fi
    fi

    # ---- Single image ------------------------------------------------------
    if (( IS_SINGLE_IMAGE_BLOCK )); then
      if [[ "$LINE_PROJ" == "" ]]; then
        IS_SINGLE_IMAGE_BLOCK=0
        if _ginv_keep_if_contains "$CUSTOM_HTML_PHOTOS" "amazonaws"; then
          cat "$CUSTOM_HTML_PHOTOS"
        fi
        echo "End of current single image."
      elif [[ "$LINE_PROJ" == *'class="hive-image"'* ]]; then
        _ginv_append_line "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
      fi
    fi

    # ---- Start markers -----------------------------------------------------
    if [[ "$LINE_PROJ" == *'div class="full-description hide"'* ]]; then
      # The body marker is written here, so it only appears in the dump when
      # a description actually exists.
      echo "Found content start."
      echo "<!-- LondonArchive_GINV_Body -->" >> "$FULLDUMP"
      # Unhide the description and turn YouTube embeds into plain links.
      converted=$(printf ' %s\n' "$LINE_PROJ" | sed \
        -e 's/.*<div class="full-description hide">/<div>/' \
        -e 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' \
        -e 's/<iframe/<a/' \
        -e 's/<\/iframe>/YouTube Link<\/a><\/br>/')
      _ginv_append_line "$converted" "$CUSTOM_HTML" "$FULLDUMP"
    elif [[ "$LINE_PROJ" == *"widget-wrap widget_document_library"* ]]; then
      IS_DOC_BLOCK=1
      echo "<!-- LondonArchive_GINV_Documents -->" >> "$FULLDUMP"
      echo "Found documents start."
    elif [[ "$LINE_PROJ" == *"hive-block-media hive-block"* ]]; then
      IS_PHOTO_BLOCK=1
      echo "<!-- LondonArchive_GINV_Photos -->" >> "$FULLDUMP"
      echo "Found photos start."
    elif [[ "$LINE_PROJ" == *"div class='widget-wrap widget_recent_photos'"* ]]; then
      # NOTE(review): FAQ capture is keyed on the recent-photos widget
      # marker; kept as found - confirm this is the intended trigger.
      IS_FAQ_BLOCK=1
      echo "<!-- LondonArchive_GINV_FAQ -->" >> "$FULLDUMP"
      echo "Found FAQ start."
    elif [[ "$LINE_PROJ" == *"widget-wrap widget_project_team"* ]]; then
      IS_PROFILE_BLOCK=1
      echo "<!-- LondonArchive_GINV_Bio -->" >> "$FULLDUMP"
      echo "Found profile start."
    elif [[ "$LINE_PROJ" == *"<div class='fr-view'>"* ]]; then
      IS_UPDATE_BLOCK=1
      echo "<!-- LondonArchive_GINV_Update -->" >> "$FULLDUMP"
      # NOTE(review): this heading goes to CUSTOM_HTML_UPDATE, which nothing
      # in this file reads back, so it never reaches Main.pdf - confirm.
      echo "<h1>Project Updates</h1>" >> "$CUSTOM_HTML_UPDATE"
      echo "Found update start."
    elif [[ "$LINE_PROJ" == *"div class='widget-wrap widget_life_cycle'"* ]]; then
      IS_KEYDATES_BLOCK=1
      echo "<!-- LondonArchive_GINV_Date -->" >> "$FULLDUMP"
      echo "Found key dates start."
    elif [[ "$LINE_PROJ" == *"<!-- Wrapper for slider -->"* ]]; then
      IS_SLIDER_BLOCK=1
      echo "<!-- LondonArchive_GINV_Slider -->" >> "$FULLDUMP"
      echo "Found slider start."
    elif [[ "$LINE_PROJ" == *"hive-block hive-block-image"* ]]; then
      IS_SINGLE_IMAGE_BLOCK=1
      echo "<!-- LondonArchive_GINV_SingleImage -->" >> "$FULLDUMP"
      echo "Found single image start."
    fi
  done < "$PROJECT_PAGE"
}

#######################################
# Download the documents and photos collected for the current project.
# Arguments: $1 - project directory inside the archive
#######################################
_ginv_download_attachments() {
  local project_dir=$1
  local LINE_DOC DOC_NAME

  if [[ -s "$CUSTOM_HTML_LINKS" ]]; then
    while IFS= read -r LINE_DOC; do
      if [[ "$LINE_DOC" == *"/documents/"* ]]; then
        mkdir -p "$project_dir/Attachments"
        DOC_NAME="$(_ginv_doc_title "$LINE_DOC").pdf"
        echo "-------- $DOC_NAME"
        _utils_download_helper "$(_ginv_href "$LINE_DOC")/download" \
          "$project_dir/Attachments/$DOC_NAME"
      fi
    done < "$CUSTOM_HTML_LINKS"
  fi

  if [[ -s "$CUSTOM_HTML_PHOTOS" ]]; then
    while IFS= read -r LINE_DOC; do
      if [[ "$LINE_DOC" == *"ehq-production"* ]]; then
        mkdir -p "$project_dir/Attachments"
        # File name = last path component without its 32-hex-digit prefix.
        DOC_NAME=$(printf '%s\n' "$LINE_DOC" \
          | sed -e 's/.*\///' -e 's/^[0-9a-f]\{32\}_//')
        echo "======== $DOC_NAME"
        _utils_download_helper "$LINE_DOC" "$project_dir/Attachments/$DOC_NAME"
      fi
    done < "$CUSTOM_HTML_PHOTOS"
  fi
}

#######################################
# Render the gallery PDF (if a slider was found) and Main.pdf, then store the
# raw dump next to them.
# Arguments: $1 - project directory inside the archive
#######################################
_ginv_render_pdfs() {
  local project_dir=$1

  if [[ -s "$CUSTOM_HTML_SLIDER" ]]; then
    mkdir -p "$project_dir/Attachments"
    cat ./template/default_getinvolved.html > "$WORK_HTML"
    echo "<h1>$PROJECT_NAME Photo Gallery</h1>" >> "$WORK_HTML"
    cat "$CUSTOM_HTML_SLIDER" >> "$WORK_HTML"
    echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> "$WORK_HTML"
    wkhtmltopdf --image-quality 100 "$WORK_HTML" \
      "$project_dir/Attachments/${PROJECT_NAME}_GALLERY.pdf"
  fi

  echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> "$CUSTOM_HTML"
  wkhtmltopdf --image-quality 100 "$CUSTOM_HTML" "$project_dir/Main.pdf"

  # The dump only exists if at least one widget was recognised.
  if [[ -e "$FULLDUMP" ]]; then
    cp "$FULLDUMP" "$project_dir/.backup.txt"
  fi
}

#######################################
# Archive the project described by PROJECT_NAME / PROJECT_URL: download its
# page, scan it, fetch attachments and render the PDFs.
# Globals: PROJECT_NAME, PROJECT_URL, WGET_UA, scratch-file paths (read)
#######################################
_ginv_archive_project() {
  local project_dir="./LondonArchive/GetInvolved/$PROJECT_NAME"

  # Not read anywhere in this file; kept in case sourced helpers consult it
  # (TODO confirm and remove).
  FOUND_DATE=0

  echo "$PROJECT_URL"
  echo "$PROJECT_NAME"
  wget --user-agent="$WGET_UA" "$PROJECT_URL" -O "$PROJECT_PAGE" --timestamping -q #--show-progress

  # Start every project from clean scratch files.
  rm -f -- "$CUSTOM_HTML_LINKS" "$CUSTOM_HTML_PHOTOS" "$CUSTOM_HTML_FAQ" \
    "$CUSTOM_HTML_PROFILE" "$CUSTOM_HTML_UPDATE" "$CUSTOM_HTML_KEYDATES" \
    "$CUSTOM_HTML_SLIDER" "$FULLDUMP"
  cat ./template/default_getinvolved.html > "$CUSTOM_HTML"
  echo "<h1>$PROJECT_NAME</h1>" >> "$CUSTOM_HTML"

  _ginv_scan_project_page

  # Show the collected document links, if any.
  if [[ -s "$CUSTOM_HTML_LINKS" ]]; then
    cat "$CUSTOM_HTML_LINKS"
  fi

  mkdir -p "$project_dir/"
  _ginv_download_attachments "$project_dir"
  _ginv_render_pdfs "$project_dir"
}

# Walk the project index. A tile contributes a link line (sets PROJECT_URL and
# clears the name) and a name line (sets PROJECT_NAME, possibly split over two
# lines). Once a name is known, the project is archived at the top of the
# next iteration. wget documents exit status 8 as "server issued an error
# response"; only that status skips the run.
if [ "$SEARCH_STATUS" -ne 8 ]; then
  while IFS= read -r LINE; do
    if [[ -n "$PROJECT_NAME" ]]; then
      _ginv_archive_project
      PROJECT_NAME=""
    fi

    # Second half of a name whose </span> was not on the first line.
    if (( NEXT_LINE_CONT_NAME )); then
      PROJECT_NAME=$(_utils_fix_dashes "$(_ginv_sanitize_name "$LAST_LINE$LINE")")
      NEXT_LINE_CONT_NAME=0
      echo "$PROJECT_NAME"
    fi

    if [[ "$LINE" == *"project-tile__meta__name"* ]]; then
      if [[ "$LINE" != *"</span"* ]]; then
        NEXT_LINE_CONT_NAME=1
        LAST_LINE=$LINE
      else
        PROJECT_NAME=$(_utils_fix_dashes "$(_ginv_sanitize_name "$LINE")")
        echo "$PROJECT_NAME"
      fi
    fi

    if [[ "$LINE" == *"project-tile__link"* ]]; then
      PROJECT_URL="$SEARCH_URL$(_ginv_href "$LINE")"
      echo " $PROJECT_URL"
      # Reset project name to mark the start of a new project
      PROJECT_NAME=""
    fi
  done < "$SEARCH_PAGE"

  # A name found on the very last index line has no "next iteration" to be
  # picked up by, so handle it here.
  if [[ -n "$PROJECT_NAME" ]]; then
    _ginv_archive_project
    PROJECT_NAME=""
  fi
fi